Skip to content

Commit 9e4f2df

Browse files
Merge pull request #428 from VyasGuru/RAG-OPTIS
Optimised RAG Prototype
2 parents bc33fef + aa7236c commit 9e4f2df

12 files changed

Lines changed: 1174 additions & 67 deletions

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Lines changed: 405 additions & 0 deletions
Large diffs are not rendered by default.

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,47 @@ class LlamaCppTextGeneration {
134134
bool generate_stream(const TextGenerationRequest& request, TextStreamCallback callback,
135135
int* out_prompt_tokens);
136136
void cancel();
137+
138+
/**
139+
* @brief Check whether the given context answers the query, using logit probing.
140+
*
141+
* Formats a Yes/No question, runs llama_decode for prefill only (no generation),
142+
* extracts logits for the "Yes" and "No" tokens at the last position, and computes
143+
* confidence via softmax. Probe tokens are removed from the KV cache before returning.
144+
*
145+
* @param context The context passage (retrieved sentence or accumulated sentences)
146+
* @param query The user query to check against the context
147+
* @return Confidence score in [0.0, 1.0] — higher means context likely answers query.
148+
* Returns 0.5 on error (neutral / unknown).
149+
*/
150+
float probe_confidence(const std::string& context, const std::string& query);
151+
152+
/**
153+
* @brief Inject a system prompt into the KV cache at position 0.
154+
* Clears existing KV cache first, then decodes the prompt tokens.
155+
* @return true on success, false on error.
156+
*/
157+
bool inject_system_prompt(const std::string& prompt);
158+
159+
/**
160+
* @brief Append text to the KV cache after current content.
161+
* Does not clear existing KV cache — adds at current position.
162+
* @return true on success, false on error.
163+
*/
164+
bool append_context(const std::string& text);
165+
166+
/**
167+
* @brief Generate a response from accumulated KV cache state.
168+
* Unlike generate(), does NOT clear the KV cache first.
169+
* @return TextGenerationResult with generated text.
170+
*/
171+
TextGenerationResult generate_from_context(const TextGenerationRequest& request);
172+
173+
/**
174+
* @brief Clear all KV cache state.
175+
*/
176+
void clear_context();
177+
137178
nlohmann::json get_model_info() const;
138179

139180
// LoRA adapter management

sdk/runanywhere-commons/src/backends/rag/inference_provider.h

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,10 +134,47 @@ class ITextGenerator {
134134

135135
/**
136136
* @brief Get maximum context size in tokens
137-
*
137+
*
138138
* @return Context window size
139139
*/
140140
virtual int context_size() const noexcept = 0;
141+
142+
/**
143+
* @brief Inject a system prompt into the KV cache at position 0.
144+
* Called once at the start of an adaptive query loop.
145+
* Default: no-op (returns false).
146+
*/
147+
virtual bool inject_system_prompt(const std::string& prompt) {
    // Base-class fallback: the prompt is not used and nothing is injected.
    static_cast<void>(prompt);
    return false;
}
148+
149+
/**
150+
* @brief Append text to the KV cache after current content.
151+
* Used to incrementally add sentences during the adaptive loop.
152+
* Default: no-op (returns false).
153+
*/
154+
virtual bool append_context(const std::string& text) {
    // Base-class fallback: the text is not used and nothing is appended.
    static_cast<void>(text);
    return false;
}
155+
156+
/**
157+
* @brief Check confidence that accumulated context answers the query.
158+
* Default: returns 0.5 (neutral — loop continues).
159+
*/
160+
virtual float probe_confidence(const std::string& context, const std::string& query) {
    // Neither argument is consulted by this base-class fallback.
    static_cast<void>(context);
    static_cast<void>(query);

    // Fixed neutral score returned when no probing is performed.
    const float neutral_score = 0.5f;
    return neutral_score;
}
163+
164+
/**
165+
* @brief Generate response using accumulated KV cache state.
166+
* Unlike generate(), does NOT clear the KV cache first.
167+
* Default: falls back to generate(query, options).
168+
*/
169+
virtual GenerationResult generate_from_context(
    const std::string& query,
    const GenerationOptions& options = GenerationOptions{}) {
    // Base-class fallback: delegate to generate() with the same query and options.
    // NOTE(review): default arguments on virtual functions are resolved from the
    // static type at the call site — overriders should repeat the same default.
    return this->generate(query, options);
}
172+
173+
/**
174+
* @brief Clear all KV cache state.
175+
* Default: no-op.
176+
*/
177+
virtual void clear_context() {
    // Intentionally empty: the base implementation does nothing.
}
141178
};
142179

143180
// =============================================================================

0 commit comments

Comments
 (0)