
Commit 93a0cfe

VyasGurushubhammalhotra28 authored and committed
Optimised RAG + implemented hybrid search
1 parent 1096408 commit 93a0cfe

26 files changed: 1126 additions & 560 deletions

sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h

Lines changed: 0 additions & 15 deletions
@@ -266,21 +266,6 @@ RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_inject_system_prompt(rac_handle_t
 RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_append_context(rac_handle_t handle,
                                                               const char* text);
 
-/**
- * Probe confidence that accumulated context answers a query.
- * Non-destructive to KV cache — probe tokens are cleaned up.
- *
- * @param handle Service handle
- * @param context Context passage (can be empty if context is in KV cache)
- * @param query The user question
- * @param out_confidence Output: confidence in [0.0, 1.0]
- * @return RAC_SUCCESS or error code
- */
-RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_probe_confidence(rac_handle_t handle,
-                                                                const char* context,
-                                                                const char* query,
-                                                                float* out_confidence);
-
 /**
  * Generate response from accumulated KV cache state.
  * Unlike rac_llm_llamacpp_generate(), does NOT clear the KV cache first.

sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h

Lines changed: 0 additions & 23 deletions
@@ -68,13 +68,6 @@ typedef struct rac_llm_service_ops {
     /** Append text to KV cache after current content (optional, NULL if not supported) */
     rac_result_t (*append_context)(void* impl, const char* text);
 
-    /**
-     * Probe confidence that accumulated context answers query (optional, NULL if not supported).
-     * Returns confidence in [0.0, 1.0] via out_confidence. Non-destructive to KV cache.
-     */
-    rac_result_t (*probe_confidence)(void* impl, const char* context, const char* query,
-                                     float* out_confidence);
-
     /**
      * Generate response from accumulated KV cache state (optional, NULL if not supported).
      * Unlike generate(), does NOT clear KV cache first.

@@ -220,22 +213,6 @@ RAC_API rac_result_t rac_llm_inject_system_prompt(rac_handle_t handle, const cha
  */
 RAC_API rac_result_t rac_llm_append_context(rac_handle_t handle, const char* text);
 
-/**
- * @brief Probe whether accumulated context answers a query
- *
- * Uses logit probing (Yes/No softmax) to estimate confidence.
- * Non-destructive — probe tokens are removed from KV cache after probing.
- * Optional — returns RAC_ERROR_NOT_SUPPORTED if backend doesn't support it.
- *
- * @param handle Service handle
- * @param context Context passage (can be empty string if context is already in KV cache)
- * @param query The user question
- * @param out_confidence Output: confidence in [0.0, 1.0]
- * @return RAC_SUCCESS or error code
- */
-RAC_API rac_result_t rac_llm_probe_confidence(rac_handle_t handle, const char* context,
-                                              const char* query, float* out_confidence);
-
 /**
  * @brief Generate a response from accumulated KV cache state
 *
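With probe_confidence gone from the ops table, the remaining KV-cache entry points (append_context, generate_from_context, clear_context) keep the same optional-op contract: when a backend leaves the pointer NULL, the service wrapper presumably returns RAC_ERROR_NOT_SUPPORTED, as the removed probe wrapper did. A minimal caller sketch, using only the declaration shown above (handle creation elided; llm is a hypothetical rac_handle_t):

    rac_result_t r = rac_llm_append_context(llm, "Retrieved passage ...");
    if (r == RAC_ERROR_NOT_SUPPORTED) {
        /* backend has no KV-cache context ops; fall back to plain generation */
    }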

sdk/runanywhere-commons/include/rac/features/rag/rac_rag_pipeline.h

Lines changed: 11 additions & 9 deletions
@@ -65,16 +65,18 @@ typedef struct rac_rag_pipeline_config {
     /** Number of top chunks to retrieve (default 10) */
     size_t top_k;
 
-    /** Minimum similarity threshold 0.0-1.0 (default 0.15) */
+    /**
+     * Minimum similarity threshold 0.0-1.0 (default 0.15).
+     */
     float similarity_threshold;
 
     /** Maximum tokens for context (default 2048) */
     size_t max_context_tokens;
 
-    /** Tokens per chunk when splitting documents (default 512) */
+    /** Tokens per chunk when splitting documents (default 180) */
     size_t chunk_size;
 
-    /** Overlap tokens between chunks (default 50) */
+    /** Overlap tokens between chunks (default 30) */
     size_t chunk_overlap;
 
     /** Prompt template with {context} and {query} placeholders (optional) */

@@ -85,13 +87,13 @@ typedef struct rac_rag_pipeline_config {
  * @brief Get default RAG pipeline configuration
  */
 static inline rac_rag_pipeline_config_t rac_rag_pipeline_config_default(void) {
-    rac_rag_pipeline_config_t cfg;
+    rac_rag_pipeline_config_t cfg = {0};
     cfg.embedding_dimension = 384;
     cfg.top_k = 10;
     cfg.similarity_threshold = 0.15f;
     cfg.max_context_tokens = 2048;
-    cfg.chunk_size = 512;
-    cfg.chunk_overlap = 50;
+    cfg.chunk_size = 180;
+    cfg.chunk_overlap = 30;
     cfg.prompt_template = NULL;
     return cfg;
 }

@@ -114,15 +116,15 @@ typedef struct rac_rag_config {
 } rac_rag_config_t;
 
 static inline rac_rag_config_t rac_rag_config_default(void) {
-    rac_rag_config_t cfg;
+    rac_rag_config_t cfg = {0};
     cfg.embedding_model_path = NULL;
     cfg.llm_model_path = NULL;
     cfg.embedding_dimension = 384;
     cfg.top_k = 10;
     cfg.similarity_threshold = 0.15f;
     cfg.max_context_tokens = 2048;
-    cfg.chunk_size = 512;
-    cfg.chunk_overlap = 50;
+    cfg.chunk_size = 180;
+    cfg.chunk_overlap = 30;
     cfg.prompt_template = NULL;
     cfg.embedding_config_json = NULL;
     cfg.llm_config_json = NULL;
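Two changes land here: the chunking defaults shrink (512/50 to 180/30 tokens, i.e. smaller chunks with proportionally less overlap), and both config structs are now zero-initialised before the field assignments, so any member added later starts at a defined value rather than garbage. A minimal usage sketch, assuming only the function and fields visible in this hunk:

    /* pick up the new defaults, then override a single field */
    rac_rag_pipeline_config_t cfg = rac_rag_pipeline_config_default();
    /* cfg.chunk_size == 180 and cfg.chunk_overlap == 30 after this commit */
    cfg.top_k = 5;  /* retrieve fewer chunks than the default 10 */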

sdk/runanywhere-commons/scripts/build-ios.sh

Lines changed: 1 addition & 0 deletions
@@ -505,6 +505,7 @@ create_backend_xcframework() {
 
     for possible_path in \
         "${PLATFORM_DIR}/src/backends/${BACKEND_NAME}/librac_backend_${BACKEND_NAME}.a" \
+        "${PLATFORM_DIR}/src/features/${BACKEND_NAME}/librac_backend_${BACKEND_NAME}.a" \
         "${PLATFORM_DIR}/${XCODE_SUBDIR}/librac_backend_${BACKEND_NAME}.a" \
         "${PLATFORM_DIR}/librac_backend_${BACKEND_NAME}.a" \
         "${PLATFORM_DIR}/backends/${BACKEND_NAME}/librac_backend_${BACKEND_NAME}.a"; do

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Lines changed: 0 additions & 100 deletions
@@ -762,106 +762,6 @@ void LlamaCppTextGeneration::cancel() {
     LOGI("Generation cancel requested");
 }
 
-float LlamaCppTextGeneration::probe_confidence(const std::string& context,
-                                               const std::string& query) {
-    std::lock_guard<std::mutex> lock(mutex_);
-
-    if (!is_ready()) {
-        LOGE("probe_confidence: model not ready");
-        return 0.5f;
-    }
-
-    const std::string probe_prompt =
-        context + "\n" + query + "\nDoes this answer the question? (Yes/No):";
-
-    LOGI("probe_confidence: prompt_len=%zu", probe_prompt.size());
-
-    const auto probe_tokens = common_tokenize(context_, probe_prompt, false, false);
-
-    if (probe_tokens.empty()) {
-        LOGE("probe_confidence: tokenization produced no tokens");
-        return 0.5f;
-    }
-
-    const int n_ctx = llama_n_ctx(context_);
-    const int n_probe = static_cast<int>(probe_tokens.size());
-
-    if (n_probe >= n_ctx) {
-        LOGE("probe_confidence: probe prompt too long (%d tokens, ctx=%d)", n_probe, n_ctx);
-        return 0.5f;
-    }
-
-    llama_memory_t mem = llama_get_memory(context_);
-    const llama_pos probe_start_pos = mem
-        ? (llama_memory_seq_pos_max(mem, 0) + 1)
-        : 0;
-
-    llama_batch batch = llama_batch_init(n_ctx, 0, 1);
-    batch.n_tokens = 0;
-
-    for (int i = 0; i < n_probe; ++i) {
-        const bool need_logits = (i == n_probe - 1);
-        common_batch_add(batch, probe_tokens[i], probe_start_pos + i, {0}, need_logits);
-    }
-
-    if (llama_decode(context_, batch) != 0) {
-        LOGE("probe_confidence: llama_decode failed");
-        llama_batch_free(batch);
-        return 0.5f;
-    }
-
-    llama_batch_free(batch);
-
-    float* logits = llama_get_logits_ith(context_, -1);
-    if (!logits) {
-        LOGE("probe_confidence: failed to get logits");
-        if (mem) {
-            llama_memory_seq_rm(mem, 0, probe_start_pos, -1);
-        }
-        return 0.5f;
-    }
-
-    const auto vocab = llama_model_get_vocab(model_);
-    const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-
-    auto get_first_token = [&](const std::string& word) -> llama_token {
-        std::vector<llama_token> toks(8);
-        int n = llama_tokenize(vocab, word.c_str(), static_cast<int32_t>(word.size()),
-                               toks.data(), static_cast<int32_t>(toks.size()),
-                               false,  // add_special
-                               false); // parse_special
-        if (n > 0 && toks[0] >= 0 && toks[0] < n_vocab) return toks[0];
-        return -1;
-    };
-
-    llama_token yes_token = get_first_token(" Yes");
-    if (yes_token < 0) yes_token = get_first_token("Yes");
-
-    llama_token no_token = get_first_token(" No");
-    if (no_token < 0) no_token = get_first_token("No");
-
-    float confidence = 0.5f;
-
-    if (yes_token >= 0 && yes_token < n_vocab && no_token >= 0 && no_token < n_vocab) {
-        const float logit_yes = logits[yes_token];
-        const float logit_no = logits[no_token];
-        const float max_logit = std::max(logit_yes, logit_no);
-        const float exp_yes = std::exp(logit_yes - max_logit);
-        const float exp_no = std::exp(logit_no - max_logit);
-        confidence = exp_yes / (exp_yes + exp_no);
-        LOGI("probe_confidence: yes_token=%d, no_token=%d, logit_yes=%.4f, logit_no=%.4f, confidence=%.4f",
-             yes_token, no_token, logit_yes, logit_no, confidence);
-    } else {
-        LOGE("probe_confidence: could not find Yes/No tokens (yes=%d, no=%d)", yes_token, no_token);
-    }
-
-    if (mem) {
-        llama_memory_seq_rm(mem, 0, probe_start_pos, -1);
-        LOGI("probe_confidence: removed probe tokens from KV cache (pos %d onwards)", probe_start_pos);
-    }
-
-    return confidence;
-}
 
 bool LlamaCppTextGeneration::inject_system_prompt(const std::string& prompt) {
     std::lock_guard<std::mutex> lock(mutex_);
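One note on the deleted math: a two-way softmax is exactly the logistic sigmoid of the logit gap, confidence = exp(logit_yes) / (exp(logit_yes) + exp(logit_no)) = 1 / (1 + exp(logit_no - logit_yes)), so the max-logit subtraction in the removed code guarded against overflow without changing the result.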

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h

Lines changed: 0 additions & 14 deletions
@@ -135,20 +135,6 @@ class LlamaCppTextGeneration {
                   int* out_prompt_tokens);
     void cancel();
 
-    /**
-     * @brief Check whether the given context answers the query, using logit probing.
-     *
-     * Formats a Yes/No question, runs llama_decode for prefill only (no generation),
-     * extracts logits for the "Yes" and "No" tokens at the last position, and computes
-     * confidence via softmax. Probe tokens are removed from the KV cache before returning.
-     *
-     * @param context The context passage (retrieved sentence or accumulated sentences)
-     * @param query The user query to check against the context
-     * @return Confidence score in [0.0, 1.0] — higher means context likely answers query.
-     *         Returns 0.5 on error (neutral / unknown).
-     */
-    float probe_confidence(const std::string& context, const std::string& query);
-
     /**
     * @brief Inject a system prompt into the KV cache at position 0.
     * Clears existing KV cache first, then decodes the prompt tokens.

sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp

Lines changed: 1 addition & 6 deletions
@@ -135,10 +135,6 @@ static rac_result_t llamacpp_vtable_append_context(void* impl, const char* text)
     return rac_llm_llamacpp_append_context(impl, text);
 }
 
-static rac_result_t llamacpp_vtable_probe_confidence(void* impl, const char* context,
-                                                     const char* query, float* out_confidence) {
-    return rac_llm_llamacpp_probe_confidence(impl, context, query, out_confidence);
-}
 
 static rac_result_t llamacpp_vtable_generate_from_context(void* impl, const char* query,
                                                           const rac_llm_options_t* options,

@@ -165,8 +161,7 @@ static const rac_llm_service_ops_t g_llamacpp_ops = {
     .get_lora_info = llamacpp_vtable_get_lora_info,
     .inject_system_prompt = llamacpp_vtable_inject_system_prompt,
     .append_context = llamacpp_vtable_append_context,
-    .probe_confidence = llamacpp_vtable_probe_confidence,
-    .generate_from_context = llamacpp_vtable_generate_from_context,
+    .generate_from_context = llamacpp_vtable_generate_from_context,
     .clear_context = llamacpp_vtable_clear_context,
 };
 
sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp

Lines changed: 1 addition & 21 deletions
@@ -442,26 +442,6 @@ rac_result_t rac_llm_llamacpp_append_context(rac_handle_t handle, const char* te
     }
 }
 
-rac_result_t rac_llm_llamacpp_probe_confidence(rac_handle_t handle, const char* context,
-                                               const char* query, float* out_confidence) {
-    if (handle == nullptr || query == nullptr || out_confidence == nullptr) {
-        return RAC_ERROR_NULL_POINTER;
-    }
-
-    auto* h = static_cast<rac_llm_llamacpp_handle_impl*>(handle);
-    if (!h->text_gen) {
-        return RAC_ERROR_INVALID_HANDLE;
-    }
-
-    try {
-        *out_confidence = h->text_gen->probe_confidence(context ? context : "", query);
-        return RAC_SUCCESS;
-    } catch (const std::exception& e) {
-        rac_error_set_details(e.what());
-        *out_confidence = 0.5f;
-        return RAC_ERROR_INFERENCE_FAILED;
-    }
-}
 
 rac_result_t rac_llm_llamacpp_generate_from_context(rac_handle_t handle, const char* query,
                                                     const rac_llm_options_t* options,

@@ -475,7 +455,7 @@ rac_result_t rac_llm_llamacpp_generate_from_context(rac_handle_t handle, const c
         return RAC_ERROR_INVALID_HANDLE;
     }
 
-    TextGenerationRequest request;
+    runanywhere::TextGenerationRequest request;
     request.prompt = query;
     if (options != nullptr) {
         request.max_tokens = options->max_tokens;

sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp

Lines changed: 0 additions & 11 deletions
@@ -243,17 +243,6 @@ rac_result_t rac_llm_append_context(rac_handle_t handle, const char* text) {
     return service->ops->append_context(service->impl, text);
 }
 
-rac_result_t rac_llm_probe_confidence(rac_handle_t handle, const char* context,
-                                      const char* query, float* out_confidence) {
-    if (!handle || !query || !out_confidence)
-        return RAC_ERROR_NULL_POINTER;
-
-    auto* service = static_cast<rac_llm_service_t*>(handle);
-    if (!service->ops || !service->ops->probe_confidence)
-        return RAC_ERROR_NOT_SUPPORTED;
-
-    return service->ops->probe_confidence(service->impl, context, query, out_confidence);
-}
 
 rac_result_t rac_llm_generate_from_context(rac_handle_t handle, const char* query,
                                            const rac_llm_options_t* options,

sdk/runanywhere-commons/src/features/rag/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -57,6 +57,7 @@ set(RAG_PIPELINE_SOURCES
     rag_backend.cpp
     vector_store_usearch.cpp
     rag_chunker.cpp
+    bm25_index.cpp
     rac_rag_register.cpp
     rac_rag_pipeline.cpp
 )

@@ -73,6 +74,7 @@ set(RAG_PIPELINE_HEADERS
     rag_backend.h
     vector_store_usearch.h
     rag_chunker.h
+    bm25_index.h
 )
 
 if(RAC_BUILD_SHARED)
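bm25_index.cpp/.h are new in this commit and back the "hybrid search" from the commit title, but their contents are not shown in this view. For orientation, a classic Okapi BM25 scorer has roughly the following shape (a hypothetical C++ sketch, not the RAC implementation; every name in it is illustrative):

    #include <cmath>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Toy in-memory BM25 index: per-term postings of term frequencies plus document lengths.
    struct Bm25Index {
        std::unordered_map<std::string, std::unordered_map<size_t, int>> postings;
        std::vector<size_t> doc_len;   // token count per document
        double avg_doc_len = 0.0;

        void add(size_t doc_id, const std::vector<std::string>& tokens) {
            for (const auto& t : tokens) postings[t][doc_id]++;
            if (doc_len.size() <= doc_id) doc_len.resize(doc_id + 1, 0);
            doc_len[doc_id] = tokens.size();
            double total = 0.0;                 // recompute average document length
            for (size_t n : doc_len) total += static_cast<double>(n);
            avg_doc_len = total / static_cast<double>(doc_len.size());
        }

        // Okapi BM25 with the customary k1/b defaults.
        std::unordered_map<size_t, double> score(const std::vector<std::string>& query,
                                                 double k1 = 1.2, double b = 0.75) const {
            std::unordered_map<size_t, double> scores;
            const double N = static_cast<double>(doc_len.size());
            for (const auto& term : query) {
                auto it = postings.find(term);
                if (it == postings.end()) continue;           // term absent from corpus
                const double df = static_cast<double>(it->second.size());
                const double idf = std::log(1.0 + (N - df + 0.5) / (df + 0.5));
                for (const auto& [doc, tf] : it->second) {    // accumulate per matching document
                    const double norm = 1.0 - b + b * static_cast<double>(doc_len[doc]) / avg_doc_len;
                    scores[doc] += idf * (tf * (k1 + 1.0)) / (tf + k1 * norm);
                }
            }
            return scores;
        }
    };

A hybrid retriever would then fuse these lexical scores with the vector-store similarities (commonly a weighted sum or reciprocal rank fusion); how rac_rag_pipeline.cpp actually combines them is not visible in this excerpt.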
