
Commit 93a0cfe

VyasGurushubhammalhotra28 authored and committed
Optimised RAG + implemented hybrid search
1 parent 1096408 commit 93a0cfe

26 files changed: 1126 additions & 560 deletions

sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h

Lines changed: 0 additions & 15 deletions
@@ -266,21 +266,6 @@ RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_inject_system_prompt(rac_handle_t
 RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_append_context(rac_handle_t handle,
                                                               const char* text);
 
-/**
- * Probe confidence that accumulated context answers a query.
- * Non-destructive to KV cache — probe tokens are cleaned up.
- *
- * @param handle Service handle
- * @param context Context passage (can be empty if context is in KV cache)
- * @param query The user question
- * @param out_confidence Output: confidence in [0.0, 1.0]
- * @return RAC_SUCCESS or error code
- */
-RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_probe_confidence(rac_handle_t handle,
-                                                                const char* context,
-                                                                const char* query,
-                                                                float* out_confidence);
-
 /**
  * Generate response from accumulated KV cache state.
  * Unlike rac_llm_llamacpp_generate(), does NOT clear the KV cache first.

sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h

Lines changed: 0 additions & 23 deletions
@@ -68,13 +68,6 @@ typedef struct rac_llm_service_ops {
     /** Append text to KV cache after current content (optional, NULL if not supported) */
     rac_result_t (*append_context)(void* impl, const char* text);
 
-    /**
-     * Probe confidence that accumulated context answers query (optional, NULL if not supported).
-     * Returns confidence in [0.0, 1.0] via out_confidence. Non-destructive to KV cache.
-     */
-    rac_result_t (*probe_confidence)(void* impl, const char* context, const char* query,
-                                     float* out_confidence);
-
     /**
      * Generate response from accumulated KV cache state (optional, NULL if not supported).
      * Unlike generate(), does NOT clear KV cache first.

@@ -220,22 +213,6 @@ RAC_API rac_result_t rac_llm_inject_system_prompt(rac_handle_t handle, const cha
  */
 RAC_API rac_result_t rac_llm_append_context(rac_handle_t handle, const char* text);
 
-/**
- * @brief Probe whether accumulated context answers a query
- *
- * Uses logit probing (Yes/No softmax) to estimate confidence.
- * Non-destructive — probe tokens are removed from KV cache after probing.
- * Optional — returns RAC_ERROR_NOT_SUPPORTED if backend doesn't support it.
- *
- * @param handle Service handle
- * @param context Context passage (can be empty string if context is already in KV cache)
- * @param query The user question
- * @param out_confidence Output: confidence in [0.0, 1.0]
- * @return RAC_SUCCESS or error code
- */
-RAC_API rac_result_t rac_llm_probe_confidence(rac_handle_t handle, const char* context,
-                                              const char* query, float* out_confidence);
-
 /**
  * @brief Generate a response from accumulated KV cache state
 *
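With probe_confidence gone from the ops table, the remaining KV-cache entry points (append_context, generate_from_context, clear_context) keep the same optional-op contract: when a backend leaves the pointer NULL, the service wrapper presumably returns RAC_ERROR_NOT_SUPPORTED, as the removed probe wrapper did. A minimal caller sketch, using only the declaration shown above (handle creation elided; llm is a hypothetical rac_handle_t):

    rac_result_t r = rac_llm_append_context(llm, "Retrieved passage ...");
    if (r == RAC_ERROR_NOT_SUPPORTED) {
        /* backend has no KV-cache context ops; fall back to plain generation */
    }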

sdk/runanywhere-commons/include/rac/features/rag/rac_rag_pipeline.h

Lines changed: 11 additions & 9 deletions
@@ -65,16 +65,18 @@ typedef struct rac_rag_pipeline_config {
     /** Number of top chunks to retrieve (default 10) */
     size_t top_k;
 
-    /** Minimum similarity threshold 0.0-1.0 (default 0.15) */
+    /**
+     * Minimum similarity threshold 0.0-1.0 (default 0.15).
+     */
     float similarity_threshold;
 
     /** Maximum tokens for context (default 2048) */
     size_t max_context_tokens;
 
-    /** Tokens per chunk when splitting documents (default 512) */
+    /** Tokens per chunk when splitting documents (default 180) */
     size_t chunk_size;
 
-    /** Overlap tokens between chunks (default 50) */
+    /** Overlap tokens between chunks (default 30) */
     size_t chunk_overlap;
 
     /** Prompt template with {context} and {query} placeholders (optional) */

@@ -85,13 +87,13 @@ typedef struct rac_rag_pipeline_config {
  * @brief Get default RAG pipeline configuration
  */
 static inline rac_rag_pipeline_config_t rac_rag_pipeline_config_default(void) {
-    rac_rag_pipeline_config_t cfg;
+    rac_rag_pipeline_config_t cfg = {0};
     cfg.embedding_dimension = 384;
     cfg.top_k = 10;
     cfg.similarity_threshold = 0.15f;
     cfg.max_context_tokens = 2048;
-    cfg.chunk_size = 512;
-    cfg.chunk_overlap = 50;
+    cfg.chunk_size = 180;
+    cfg.chunk_overlap = 30;
     cfg.prompt_template = NULL;
     return cfg;
 }

@@ -114,15 +116,15 @@ typedef struct rac_rag_config {
 } rac_rag_config_t;
 
 static inline rac_rag_config_t rac_rag_config_default(void) {
-    rac_rag_config_t cfg;
+    rac_rag_config_t cfg = {0};
     cfg.embedding_model_path = NULL;
     cfg.llm_model_path = NULL;
     cfg.embedding_dimension = 384;
     cfg.top_k = 10;
     cfg.similarity_threshold = 0.15f;
     cfg.max_context_tokens = 2048;
-    cfg.chunk_size = 512;
-    cfg.chunk_overlap = 50;
+    cfg.chunk_size = 180;
+    cfg.chunk_overlap = 30;
     cfg.prompt_template = NULL;
     cfg.embedding_config_json = NULL;
     cfg.llm_config_json = NULL;
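Two changes land here: the chunking defaults shrink (512/50 to 180/30 tokens, i.e. smaller chunks with proportionally less overlap), and both config structs are now zero-initialised before the field assignments, so any member added later starts at a defined value rather than garbage. A minimal usage sketch, assuming only the function and fields visible in this hunk:

    /* pick up the new defaults, then override a single field */
    rac_rag_pipeline_config_t cfg = rac_rag_pipeline_config_default();
    /* cfg.chunk_size == 180 and cfg.chunk_overlap == 30 after this commit */
    cfg.top_k = 5;  /* retrieve fewer chunks than the default 10 */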

sdk/runanywhere-commons/scripts/build-ios.sh

Lines changed: 1 addition & 0 deletions
@@ -505,6 +505,7 @@ create_backend_xcframework() {
 
     for possible_path in \
         "${PLATFORM_DIR}/src/backends/${BACKEND_NAME}/librac_backend_${BACKEND_NAME}.a" \
+        "${PLATFORM_DIR}/src/features/${BACKEND_NAME}/librac_backend_${BACKEND_NAME}.a" \
         "${PLATFORM_DIR}/${XCODE_SUBDIR}/librac_backend_${BACKEND_NAME}.a" \
         "${PLATFORM_DIR}/librac_backend_${BACKEND_NAME}.a" \
         "${PLATFORM_DIR}/backends/${BACKEND_NAME}/librac_backend_${BACKEND_NAME}.a"; do

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Lines changed: 0 additions & 100 deletions
@@ -762,106 +762,6 @@ void LlamaCppTextGeneration::cancel() {
     LOGI("Generation cancel requested");
 }
 
-float LlamaCppTextGeneration::probe_confidence(const std::string& context,
-                                               const std::string& query) {
-    std::lock_guard<std::mutex> lock(mutex_);
-
-    if (!is_ready()) {
-        LOGE("probe_confidence: model not ready");
-        return 0.5f;
-    }
-
-    const std::string probe_prompt =
-        context + "\n" + query + "\nDoes this answer the question? (Yes/No):";
-
-    LOGI("probe_confidence: prompt_len=%zu", probe_prompt.size());
-
-    const auto probe_tokens = common_tokenize(context_, probe_prompt, false, false);
-
-    if (probe_tokens.empty()) {
-        LOGE("probe_confidence: tokenization produced no tokens");
-        return 0.5f;
-    }
-
-    const int n_ctx = llama_n_ctx(context_);
-    const int n_probe = static_cast<int>(probe_tokens.size());
-
-    if (n_probe >= n_ctx) {
-        LOGE("probe_confidence: probe prompt too long (%d tokens, ctx=%d)", n_probe, n_ctx);
-        return 0.5f;
-    }
-
-    llama_memory_t mem = llama_get_memory(context_);
-    const llama_pos probe_start_pos = mem
-        ? (llama_memory_seq_pos_max(mem, 0) + 1)
-        : 0;
-
-    llama_batch batch = llama_batch_init(n_ctx, 0, 1);
-    batch.n_tokens = 0;
-
-    for (int i = 0; i < n_probe; ++i) {
-        const bool need_logits = (i == n_probe - 1);
-        common_batch_add(batch, probe_tokens[i], probe_start_pos + i, {0}, need_logits);
-    }
-
-    if (llama_decode(context_, batch) != 0) {
-        LOGE("probe_confidence: llama_decode failed");
-        llama_batch_free(batch);
-        return 0.5f;
-    }
-
-    llama_batch_free(batch);
-
-    float* logits = llama_get_logits_ith(context_, -1);
-    if (!logits) {
-        LOGE("probe_confidence: failed to get logits");
-        if (mem) {
-            llama_memory_seq_rm(mem, 0, probe_start_pos, -1);
-        }
-        return 0.5f;
-    }
-
-    const auto vocab = llama_model_get_vocab(model_);
-    const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-
-    auto get_first_token = [&](const std::string& word) -> llama_token {
-        std::vector<llama_token> toks(8);
-        int n = llama_tokenize(vocab, word.c_str(), static_cast<int32_t>(word.size()),
-                               toks.data(), static_cast<int32_t>(toks.size()),
-                               false,  // add_special
-                               false); // parse_special
-        if (n > 0 && toks[0] >= 0 && toks[0] < n_vocab) return toks[0];
-        return -1;
-    };
-
-    llama_token yes_token = get_first_token(" Yes");
-    if (yes_token < 0) yes_token = get_first_token("Yes");
-
-    llama_token no_token = get_first_token(" No");
-    if (no_token < 0) no_token = get_first_token("No");
-
-    float confidence = 0.5f;
-
-    if (yes_token >= 0 && yes_token < n_vocab && no_token >= 0 && no_token < n_vocab) {
-        const float logit_yes = logits[yes_token];
-        const float logit_no = logits[no_token];
-        const float max_logit = std::max(logit_yes, logit_no);
-        const float exp_yes = std::exp(logit_yes - max_logit);
-        const float exp_no = std::exp(logit_no - max_logit);
-        confidence = exp_yes / (exp_yes + exp_no);
-        LOGI("probe_confidence: yes_token=%d, no_token=%d, logit_yes=%.4f, logit_no=%.4f, confidence=%.4f",
-             yes_token, no_token, logit_yes, logit_no, confidence);
-    } else {
-        LOGE("probe_confidence: could not find Yes/No tokens (yes=%d, no=%d)", yes_token, no_token);
-    }
-
-    if (mem) {
-        llama_memory_seq_rm(mem, 0, probe_start_pos, -1);
-        LOGI("probe_confidence: removed probe tokens from KV cache (pos %d onwards)", probe_start_pos);
-    }
-
-    return confidence;
-}
 
 bool LlamaCppTextGeneration::inject_system_prompt(const std::string& prompt) {
     std::lock_guard<std::mutex> lock(mutex_);
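One note on the deleted math: a two-way softmax is exactly the logistic sigmoid of the logit gap, confidence = exp(logit_yes) / (exp(logit_yes) + exp(logit_no)) = 1 / (1 + exp(logit_no - logit_yes)), so the max-logit subtraction in the removed code guarded against overflow without changing the result.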

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h

Lines changed: 0 additions & 14 deletions
@@ -135,20 +135,6 @@ class LlamaCppTextGeneration {
                   int* out_prompt_tokens);
     void cancel();
 
-    /**
-     * @brief Check whether the given context answers the query, using logit probing.
-     *
-     * Formats a Yes/No question, runs llama_decode for prefill only (no generation),
-     * extracts logits for the "Yes" and "No" tokens at the last position, and computes
-     * confidence via softmax. Probe tokens are removed from the KV cache before returning.
-     *
-     * @param context The context passage (retrieved sentence or accumulated sentences)
-     * @param query The user query to check against the context
-     * @return Confidence score in [0.0, 1.0] — higher means context likely answers query.
-     *         Returns 0.5 on error (neutral / unknown).
-     */
-    float probe_confidence(const std::string& context, const std::string& query);
-
     /**
     * @brief Inject a system prompt into the KV cache at position 0.
     * Clears existing KV cache first, then decodes the prompt tokens.

sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp

Lines changed: 1 addition & 6 deletions
@@ -135,10 +135,6 @@ static rac_result_t llamacpp_vtable_append_context(void* impl, const char* text)
     return rac_llm_llamacpp_append_context(impl, text);
 }
 
-static rac_result_t llamacpp_vtable_probe_confidence(void* impl, const char* context,
-                                                     const char* query, float* out_confidence) {
-    return rac_llm_llamacpp_probe_confidence(impl, context, query, out_confidence);
-}
 
 static rac_result_t llamacpp_vtable_generate_from_context(void* impl, const char* query,
                                                           const rac_llm_options_t* options,

@@ -165,8 +161,7 @@ static const rac_llm_service_ops_t g_llamacpp_ops = {
     .get_lora_info = llamacpp_vtable_get_lora_info,
     .inject_system_prompt = llamacpp_vtable_inject_system_prompt,
     .append_context = llamacpp_vtable_append_context,
-    .probe_confidence = llamacpp_vtable_probe_confidence,
-    .generate_from_context = llamacpp_vtable_generate_from_context,
+    .generate_from_context = llamacpp_vtable_generate_from_context,
     .clear_context = llamacpp_vtable_clear_context,
 };
 
sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp

Lines changed: 1 addition & 21 deletions
@@ -442,26 +442,6 @@ rac_result_t rac_llm_llamacpp_append_context(rac_handle_t handle, const char* te
     }
 }
 
-rac_result_t rac_llm_llamacpp_probe_confidence(rac_handle_t handle, const char* context,
-                                               const char* query, float* out_confidence) {
-    if (handle == nullptr || query == nullptr || out_confidence == nullptr) {
-        return RAC_ERROR_NULL_POINTER;
-    }
-
-    auto* h = static_cast<rac_llm_llamacpp_handle_impl*>(handle);
-    if (!h->text_gen) {
-        return RAC_ERROR_INVALID_HANDLE;
-    }
-
-    try {
-        *out_confidence = h->text_gen->probe_confidence(context ? context : "", query);
-        return RAC_SUCCESS;
-    } catch (const std::exception& e) {
-        rac_error_set_details(e.what());
-        *out_confidence = 0.5f;
-        return RAC_ERROR_INFERENCE_FAILED;
-    }
-}
 
 rac_result_t rac_llm_llamacpp_generate_from_context(rac_handle_t handle, const char* query,
                                                     const rac_llm_options_t* options,

@@ -475,7 +455,7 @@ rac_result_t rac_llm_llamacpp_generate_from_context(rac_handle_t handle, const c
         return RAC_ERROR_INVALID_HANDLE;
     }
 
-    TextGenerationRequest request;
+    runanywhere::TextGenerationRequest request;
     request.prompt = query;
     if (options != nullptr) {
         request.max_tokens = options->max_tokens;

sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp

Lines changed: 0 additions & 11 deletions
@@ -243,17 +243,6 @@ rac_result_t rac_llm_append_context(rac_handle_t handle, const char* text) {
     return service->ops->append_context(service->impl, text);
 }
 
-rac_result_t rac_llm_probe_confidence(rac_handle_t handle, const char* context,
-                                      const char* query, float* out_confidence) {
-    if (!handle || !query || !out_confidence)
-        return RAC_ERROR_NULL_POINTER;
-
-    auto* service = static_cast<rac_llm_service_t*>(handle);
-    if (!service->ops || !service->ops->probe_confidence)
-        return RAC_ERROR_NOT_SUPPORTED;
-
-    return service->ops->probe_confidence(service->impl, context, query, out_confidence);
-}
 
 rac_result_t rac_llm_generate_from_context(rac_handle_t handle, const char* query,
                                            const rac_llm_options_t* options,

sdk/runanywhere-commons/src/features/rag/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -57,6 +57,7 @@ set(RAG_PIPELINE_SOURCES
     rag_backend.cpp
     vector_store_usearch.cpp
     rag_chunker.cpp
+    bm25_index.cpp
     rac_rag_register.cpp
     rac_rag_pipeline.cpp
 )

@@ -73,6 +74,7 @@ set(RAG_PIPELINE_HEADERS
     rag_backend.h
     vector_store_usearch.h
     rag_chunker.h
+    bm25_index.h
 )
 
 if(RAC_BUILD_SHARED)
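bm25_index.cpp/.h are new in this commit and back the "hybrid search" from the commit title, but their contents are not shown in this view. For orientation, a classic Okapi BM25 scorer has roughly the following shape (a hypothetical C++ sketch, not the RAC implementation; every name in it is illustrative):

    #include <cmath>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Toy in-memory BM25 index: per-term postings of term frequencies plus document lengths.
    struct Bm25Index {
        std::unordered_map<std::string, std::unordered_map<size_t, int>> postings;
        std::vector<size_t> doc_len;   // token count per document
        double avg_doc_len = 0.0;

        void add(size_t doc_id, const std::vector<std::string>& tokens) {
            for (const auto& t : tokens) postings[t][doc_id]++;
            if (doc_len.size() <= doc_id) doc_len.resize(doc_id + 1, 0);
            doc_len[doc_id] = tokens.size();
            double total = 0.0;                 // recompute average document length
            for (size_t n : doc_len) total += static_cast<double>(n);
            avg_doc_len = total / static_cast<double>(doc_len.size());
        }

        // Okapi BM25 with the customary k1/b defaults.
        std::unordered_map<size_t, double> score(const std::vector<std::string>& query,
                                                 double k1 = 1.2, double b = 0.75) const {
            std::unordered_map<size_t, double> scores;
            const double N = static_cast<double>(doc_len.size());
            for (const auto& term : query) {
                auto it = postings.find(term);
                if (it == postings.end()) continue;           // term absent from corpus
                const double df = static_cast<double>(it->second.size());
                const double idf = std::log(1.0 + (N - df + 0.5) / (df + 0.5));
                for (const auto& [doc, tf] : it->second) {    // accumulate per matching document
                    const double norm = 1.0 - b + b * static_cast<double>(doc_len[doc]) / avg_doc_len;
                    scores[doc] += idf * (tf * (k1 + 1.0)) / (tf + k1 * norm);
                }
            }
            return scores;
        }
    };

A hybrid retriever would then fuse these lexical scores with the vector-store similarities (commonly a weighted sum or reciprocal rank fusion); how rac_rag_pipeline.cpp actually combines them is not visible in this excerpt.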
