From 446d5929a5196e038859c6a2afbbb7a94831f34e Mon Sep 17 00:00:00 2001 From: Sanchit Monga Date: Mon, 13 Apr 2026 23:57:18 -0700 Subject: [PATCH] feat: add grammar-constrained structured output across SDKs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces GBNF grammar-constrained decoding for guaranteed valid JSON output matching a developer schema. Implementation layered through the C++ core, with bindings exposed in all platform SDKs. - Commons (C++): llamacpp grammar sampler at head of sampler chain, JSON Schema → GBNF converter, new component-level structured generate/stream APIs, vtable op json_schema_to_grammar, updated platform backend vtable with explicit NULL entries - JNI: wire grammar field through commons + llamacpp bridges - Swift: StructuredOutputFallback, extended StructuredOutputConfig, generate via rac_llm_component_generate_structured - Web: WASM offset helpers for new fields, TypeScript types updated, StructuredOutputFallback exported - Kotlin: StructuredOutputFallback enum, extended config, LlamaCPPBridge JNI declarations for direct LLM ops + schema-to-grammar Co-Authored-By: Claude Opus 4.6 (1M context) --- .../exports/RACommons.exports | 7 ++ .../include/rac/backends/rac_llm_llamacpp.h | 20 ++++ .../rac/features/llm/rac_llm_component.h | 45 +++++++ .../rac/features/llm/rac_llm_service.h | 21 ++++ .../include/rac/features/llm/rac_llm_types.h | 33 +++++- .../llamacpp/jni/rac_backend_llamacpp_jni.cpp | 52 +++++++- .../backends/llamacpp/llamacpp_backend.cpp | 61 +++++++++- .../src/backends/llamacpp/llamacpp_backend.h | 11 ++ .../rac_backend_llamacpp_register.cpp | 7 ++ .../backends/llamacpp/rac_llm_llamacpp.cpp | 53 ++++++++- .../src/features/llm/llm_component.cpp | 111 ++++++++++++++++++ .../src/features/llm/rac_llm_service.cpp | 16 +++ .../rac_backend_platform_register.cpp | 9 ++ .../src/jni/runanywhere_commons_jni.cpp | 15 +++ .../sdk/llm/llamacpp/LlamaCPPBridge.kt | 57 +++++++++ 
.../sdk/public/extensions/LLM/LLMTypes.kt | 22 ++++ .../CRACommons/include/rac_llm_component.h | 45 +++++++ .../CRACommons/include/rac_llm_service.h | 21 ++++ .../CRACommons/include/rac_llm_types.h | 33 +++++- .../Public/Extensions/LLM/LLMTypes.swift | 27 ++++- .../LLM/RunAnywhere+StructuredOutput.swift | 46 +++++++- .../core/src/Foundation/StructOffsets.ts | 4 +- .../RunAnywhere+StructuredOutput.ts | 22 ++++ .../src/Foundation/LlamaCppOffsets.ts | 4 + .../packages/llamacpp/src/index.ts | 2 +- sdk/runanywhere-web/wasm/src/wasm_exports.cpp | 12 ++ 26 files changed, 734 insertions(+), 22 deletions(-) diff --git a/sdk/runanywhere-commons/exports/RACommons.exports b/sdk/runanywhere-commons/exports/RACommons.exports index 5ea2cf5c8..ee55ce48a 100644 --- a/sdk/runanywhere-commons/exports/RACommons.exports +++ b/sdk/runanywhere-commons/exports/RACommons.exports @@ -226,6 +226,13 @@ _rac_llm_component_load_model _rac_llm_component_supports_streaming _rac_llm_component_unload +# LLM Component - Structured Output +_rac_llm_component_generate_structured +_rac_llm_component_generate_structured_stream + +# LLM Service - Grammar +_rac_llm_json_schema_to_grammar + # LLM Component - LoRA _rac_llm_component_load_lora _rac_llm_component_remove_lora diff --git a/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h b/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h index b98e67d1b..c78105d01 100644 --- a/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h +++ b/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h @@ -288,6 +288,26 @@ RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_from_context( */ RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_clear_context(rac_handle_t handle); +// ============================================================================= +// JSON SCHEMA → GBNF GRAMMAR CONVERSION +// ============================================================================= + +/** + * Convert a JSON Schema string to a 
GBNF grammar string for constrained decoding. + * + * Uses llama.cpp's built-in json-schema-to-grammar converter. The resulting + * grammar can be passed via rac_llm_options_t.grammar for grammar-constrained + * token generation. + * + * @param handle Service handle (from rac_llm_llamacpp_create) + * @param json_schema JSON Schema string + * @param out_grammar Output: GBNF grammar string (caller must free with rac_free) + * @return RAC_SUCCESS or error code + */ +RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_json_schema_to_grammar(rac_handle_t handle, + const char* json_schema, + char** out_grammar); + // ============================================================================= // BACKEND REGISTRATION // ============================================================================= diff --git a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h index bae475b6e..f80918af8 100644 --- a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h +++ b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h @@ -166,6 +166,51 @@ RAC_API rac_result_t rac_llm_component_generate(rac_handle_t handle, const char* const rac_llm_options_t* options, rac_llm_result_t* out_result); +// ============================================================================= +// STRUCTURED OUTPUT - Grammar-constrained generation +// ============================================================================= + +/** + * @brief Generate structured output with grammar-constrained decoding + * + * Converts JSON schema to GBNF grammar, applies grammar constraint during + * token generation so the LLM can only produce valid JSON matching the schema. + * Falls back to prompt-only mode if grammar conversion is not supported. 
+ * + * @param handle Component handle + * @param prompt Input prompt + * @param options Generation options (can be NULL for defaults) + * @param so_config Structured output config with JSON schema and fallback settings + * @param out_result Output: Generation result (text will be valid JSON) + * @return RAC_SUCCESS or error code + */ +RAC_API rac_result_t rac_llm_component_generate_structured( + rac_handle_t handle, const char* prompt, const rac_llm_options_t* options, + const rac_structured_output_config_t* so_config, rac_llm_result_t* out_result); + +/** + * @brief Generate structured output with streaming and grammar constraints + * + * Same as generate_structured but with token-by-token streaming callbacks. + * Each emitted token is guaranteed to conform to the grammar. + * + * @param handle Component handle + * @param prompt Input prompt + * @param options Generation options (can be NULL for defaults) + * @param so_config Structured output config with JSON schema + * @param token_callback Called for each generated token + * @param complete_callback Called when generation completes + * @param error_callback Called on error + * @param user_data User context passed to callbacks + * @return RAC_SUCCESS or error code + */ +RAC_API rac_result_t rac_llm_component_generate_structured_stream( + rac_handle_t handle, const char* prompt, const rac_llm_options_t* options, + const rac_structured_output_config_t* so_config, + rac_llm_component_token_callback_fn token_callback, + rac_llm_component_complete_callback_fn complete_callback, + rac_llm_component_error_callback_fn error_callback, void* user_data); + /** * @brief Check if streaming is supported * diff --git a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h index 96bb2756c..30b0de82d 100644 --- a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h +++ 
b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h @@ -78,6 +78,12 @@ typedef struct rac_llm_service_ops { /** Clear all KV cache state (optional, NULL if not supported) */ rac_result_t (*clear_context)(void* impl); + + /** + * Convert JSON Schema to GBNF grammar string (optional, NULL if not supported). + * Caller must free out_grammar with rac_free(). + */ + rac_result_t (*json_schema_to_grammar)(void* impl, const char* json_schema, char** out_grammar); } rac_llm_service_ops_t; /** @@ -185,6 +191,21 @@ RAC_API void rac_llm_destroy(rac_handle_t handle); */ RAC_API void rac_llm_result_free(rac_llm_result_t* result); +/** + * @brief Convert JSON Schema to GBNF grammar string + * + * Routes through service registry to the backend's json_schema_to_grammar op. + * The resulting GBNF grammar can be passed in rac_llm_options_t.grammar + * for grammar-constrained decoding. + * + * @param handle Service handle + * @param json_schema JSON Schema string + * @param out_grammar Output: GBNF grammar string (caller must free with rac_free) + * @return RAC_SUCCESS or RAC_ERROR_NOT_SUPPORTED if backend doesn't support grammar + */ +RAC_API rac_result_t rac_llm_json_schema_to_grammar(rac_handle_t handle, const char* json_schema, + char** out_grammar); + // ============================================================================= // ADAPTIVE CONTEXT API - For RAG and similar pipelines // ============================================================================= diff --git a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_types.h b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_types.h index e3755f851..19a8b17c8 100644 --- a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_types.h +++ b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_types.h @@ -93,6 +93,9 @@ typedef struct rac_llm_options { /** System prompt (can be NULL) */ const char* system_prompt; + + /** GBNF grammar string for constrained decoding (can be NULL for 
unconstrained) */ + const char* grammar; } rac_llm_options_t; /** @@ -104,7 +107,8 @@ static const rac_llm_options_t RAC_LLM_OPTIONS_DEFAULT = {.max_tokens = 100, .stop_sequences = RAC_NULL, .num_stop_sequences = 0, .streaming_enabled = RAC_FALSE, - .system_prompt = RAC_NULL}; + .system_prompt = RAC_NULL, + .grammar = RAC_NULL}; // ============================================================================= // RESULT - Mirrors Swift's LLMGenerationResult @@ -209,6 +213,18 @@ static const rac_thinking_tag_pattern_t RAC_THINKING_TAG_FULL = {.opening_tag = // STRUCTURED OUTPUT - Mirrors Swift's StructuredOutputConfig // ============================================================================= +/** + * @brief Fallback strategy when grammar-constrained structured output fails + */ +typedef enum rac_structured_output_fallback { + /** Return raw text output (no parsing attempt) */ + RAC_STRUCTURED_OUTPUT_FALLBACK_RAW = 0, + /** Retry generation with grammar constraint (default) */ + RAC_STRUCTURED_OUTPUT_FALLBACK_RETRY = 1, + /** Fall back to prompt-only mode (no grammar constraint) */ + RAC_STRUCTURED_OUTPUT_FALLBACK_PROMPT_ONLY = 2 +} rac_structured_output_fallback_t; + /** * @brief Structured output configuration * @@ -223,13 +239,26 @@ typedef struct rac_structured_output_config { /** Whether to include the schema in the prompt */ rac_bool_t include_schema_in_prompt; + + /** Enable GBNF grammar-constrained decoding (default: true when json_schema is set) */ + rac_bool_t use_grammar; + + /** Maximum retry attempts on failure (default: 3) */ + int32_t max_retries; + + /** Fallback strategy on failure (default: RETRY) */ + rac_structured_output_fallback_t fallback; } rac_structured_output_config_t; /** * @brief Default structured output configuration */ static const rac_structured_output_config_t RAC_STRUCTURED_OUTPUT_DEFAULT = { - .json_schema = RAC_NULL, .include_schema_in_prompt = RAC_TRUE}; + .json_schema = RAC_NULL, + .include_schema_in_prompt = RAC_TRUE, + 
.use_grammar = RAC_TRUE, + .max_retries = 3, + .fallback = RAC_STRUCTURED_OUTPUT_FALLBACK_RETRY}; /** * @brief Structured output validation result diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/jni/rac_backend_llamacpp_jni.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/jni/rac_backend_llamacpp_jni.cpp index 56626c4aa..3561a95d4 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/jni/rac_backend_llamacpp_jni.cpp +++ b/sdk/runanywhere-commons/src/backends/llamacpp/jni/rac_backend_llamacpp_jni.cpp @@ -219,7 +219,8 @@ Java_com_runanywhere_sdk_llm_llamacpp_LlamaCPPBridge_nativeDestroy( JNIEXPORT jstring JNICALL Java_com_runanywhere_sdk_llm_llamacpp_LlamaCPPBridge_nativeGenerate( JNIEnv* env, jclass clazz, - jlong handle, jstring prompt, jint maxTokens, jfloat temperature) { + jlong handle, jstring prompt, jint maxTokens, jfloat temperature, + jstring grammar) { (void)clazz; if (handle == 0) { @@ -240,12 +241,24 @@ Java_com_runanywhere_sdk_llm_llamacpp_LlamaCPPBridge_nativeGenerate( options.max_tokens = maxTokens; options.temperature = temperature; + // Wire grammar field for constrained decoding + const char* grammarStr = nullptr; + if (grammar != nullptr) { + grammarStr = env->GetStringUTFChars(grammar, nullptr); + if (grammarStr && grammarStr[0] != '\0') { + options.grammar = grammarStr; + } + } + rac_llm_result_t result = {}; rac_result_t status = rac_llm_llamacpp_generate( reinterpret_cast(handle), promptStr, &options, &result); env->ReleaseStringUTFChars(prompt, promptStr); + if (grammarStr) { + env->ReleaseStringUTFChars(grammar, grammarStr); + } if (status != RAC_SUCCESS) { LOGe("nativeGenerate: Failed with status %d", status); @@ -263,6 +276,43 @@ Java_com_runanywhere_sdk_llm_llamacpp_LlamaCPPBridge_nativeGenerate( return output; } +/** + * Convert JSON Schema to GBNF grammar string + */ +JNIEXPORT jstring JNICALL +Java_com_runanywhere_sdk_llm_llamacpp_LlamaCPPBridge_nativeJsonSchemaToGrammar( + JNIEnv* env, jclass clazz, + jlong handle, 
jstring jsonSchema) { + (void)clazz; + + if (handle == 0) { + LOGe("nativeJsonSchemaToGrammar: Invalid handle"); + return nullptr; + } + + const char* schemaStr = env->GetStringUTFChars(jsonSchema, nullptr); + if (!schemaStr) { + LOGe("nativeJsonSchemaToGrammar: Failed to get schema"); + return nullptr; + } + + char* grammarOut = nullptr; + rac_result_t status = rac_llm_llamacpp_json_schema_to_grammar( + reinterpret_cast(handle), + schemaStr, &grammarOut); + + env->ReleaseStringUTFChars(jsonSchema, schemaStr); + + if (status != RAC_SUCCESS || !grammarOut) { + LOGe("nativeJsonSchemaToGrammar: Failed with status %d", status); + return nullptr; + } + + jstring result = env->NewStringUTF(grammarOut); + free(grammarOut); + return result; +} + /** * Cancel ongoing generation */ diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp index 646503868..6dde8e892 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp +++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp @@ -1,6 +1,7 @@ #include "llamacpp_backend.h" #include "common.h" +#include "json-schema-to-grammar.h" #include #include @@ -667,7 +668,8 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques cached_temperature_ == request.temperature && cached_top_p_ == request.top_p && cached_top_k_ == request.top_k && - cached_repetition_penalty_ == request.repetition_penalty; + cached_repetition_penalty_ == request.repetition_penalty && + cached_grammar_ == request.grammar; if (!params_match) { if (sampler_) { @@ -678,6 +680,22 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques sparams.no_perf = true; sampler_ = llama_sampler_chain_init(sparams); + // Grammar sampler goes first — masks invalid tokens before other samplers run + if (!request.grammar.empty()) { + const auto* const grammar_vocab = llama_model_get_vocab(model_); 
+ auto* grammar_sampler = + llama_sampler_init_grammar(grammar_vocab, request.grammar.c_str(), "root"); + if (grammar_sampler) { + llama_sampler_chain_add(sampler_, grammar_sampler); + RAC_LOG_INFO("LLM.LlamaCpp", + "Grammar-constrained decoding enabled (GBNF length=%zu)", + request.grammar.size()); + } else { + RAC_LOG_WARNING("LLM.LlamaCpp", + "Failed to parse GBNF grammar, proceeding without constraint"); + } + } + if (request.temperature > 0.0f) { llama_sampler_chain_add(sampler_, llama_sampler_init_penalties(kRepeatPenaltyWindow, request.repetition_penalty, 0.0f, 0.0f)); @@ -697,16 +715,18 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques cached_top_p_ = request.top_p; cached_top_k_ = request.top_k; cached_repetition_penalty_ = request.repetition_penalty; + cached_grammar_ = request.grammar; } } // Log generation parameters RAC_LOG_INFO("LLM.LlamaCpp","[PARAMS] LLM generate_stream (per-request options): temperature=%.4f, top_p=%.4f, top_k=%d, " "max_tokens=%d (effective=%d), repetition_penalty=%.4f, " - "system_prompt_len=%zu", + "system_prompt_len=%zu, grammar=%s", request.temperature, request.top_p, request.top_k, request.max_tokens, effective_max_tokens, request.repetition_penalty, - request.system_prompt.length()); + request.system_prompt.length(), + request.grammar.empty() ? 
"none" : "enabled"); const auto* const vocab = llama_model_get_vocab(model_); @@ -992,11 +1012,21 @@ TextGenerationResult LlamaCppTextGeneration::generate_from_context(const TextGen } llama_sampler* sampler = nullptr; + const auto vocab = llama_model_get_vocab(model_); { auto sparams = llama_sampler_chain_default_params(); sparams.no_perf = true; sampler = llama_sampler_chain_init(sparams); + // Grammar sampler first (masks invalid tokens) + if (!request.grammar.empty()) { + auto* grammar_sampler = + llama_sampler_init_grammar(vocab, request.grammar.c_str(), "root"); + if (grammar_sampler) { + llama_sampler_chain_add(sampler, grammar_sampler); + } + } + if (request.temperature > 0.0f) { llama_sampler_chain_add(sampler, llama_sampler_init_penalties(64, request.repetition_penalty, 0.0f, 0.0f)); @@ -1011,8 +1041,6 @@ TextGenerationResult LlamaCppTextGeneration::generate_from_context(const TextGen } } - const auto vocab = llama_model_get_vocab(model_); - static const std::vector STOP_SEQUENCES = { "<|im_end|>", "<|eot_id|>", "", "<|end|>", "<|endoftext|>", "\n\nUser:", "\n\nHuman:", @@ -1220,6 +1248,7 @@ bool LlamaCppTextGeneration::recreate_context() { cached_top_p_ = -1.0f; cached_top_k_ = -1; cached_repetition_penalty_ = -1.0f; + cached_grammar_.clear(); RAC_LOG_INFO("LLM.LlamaCpp","Context recreated successfully"); return true; @@ -1372,4 +1401,26 @@ nlohmann::json LlamaCppTextGeneration::get_lora_info() const { return adapters; } +// ============================================================================= +// JSON SCHEMA → GBNF GRAMMAR CONVERSION +// ============================================================================= + +std::string LlamaCppTextGeneration::convert_json_schema_to_grammar(const std::string& json_schema) { + if (json_schema.empty()) { + LOGW("convert_json_schema_to_grammar: empty schema"); + return ""; + } + + try { + auto schema = nlohmann::ordered_json::parse(json_schema); + std::string grammar = json_schema_to_grammar(schema); + 
LOGI("Converted JSON schema to GBNF grammar (schema=%zu chars, grammar=%zu chars)", + json_schema.size(), grammar.size()); + return grammar; + } catch (const std::exception& e) { + LOGW("Failed to convert JSON schema to GBNF: %s", e.what()); + return ""; + } +} + } // namespace runanywhere diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h index 3fa5bc1a1..bff4ab957 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h +++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h @@ -46,6 +46,9 @@ struct TextGenerationRequest { int top_k = 40; float repetition_penalty = 1.1f; std::vector stop_sequences; + + /** GBNF grammar string for constrained decoding (empty = no constraint) */ + std::string grammar; }; struct TextGenerationResult { @@ -168,6 +171,13 @@ class LlamaCppTextGeneration { nlohmann::json get_model_info() const; + /** + * @brief Convert JSON Schema to GBNF grammar string + * @param json_schema JSON Schema as string + * @return GBNF grammar string, or empty on error + */ + std::string convert_json_schema_to_grammar(const std::string& json_schema); + // LoRA adapter management bool load_lora_adapter(const std::string& adapter_path, float scale); bool remove_lora_adapter(const std::string& adapter_path); @@ -192,6 +202,7 @@ class LlamaCppTextGeneration { float cached_top_p_ = -1.0f; int cached_top_k_ = -1; float cached_repetition_penalty_ = -1.0f; + std::string cached_grammar_; bool model_loaded_ = false; std::atomic cancel_requested_{false}; diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp index 0d2b17033..570e79582 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp +++ b/sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp @@ -146,6 
+146,12 @@ static rac_result_t llamacpp_vtable_clear_context(void* impl) { return rac_llm_llamacpp_clear_context(impl); } +// JSON Schema → GBNF grammar conversion +static rac_result_t llamacpp_vtable_json_schema_to_grammar(void* impl, const char* json_schema, + char** out_grammar) { + return rac_llm_llamacpp_json_schema_to_grammar(impl, json_schema, out_grammar); +} + // Static vtable for LlamaCpp static const rac_llm_service_ops_t g_llamacpp_ops = { .initialize = llamacpp_vtable_initialize, @@ -163,6 +169,7 @@ static const rac_llm_service_ops_t g_llamacpp_ops = { .append_context = llamacpp_vtable_append_context, .generate_from_context = llamacpp_vtable_generate_from_context, .clear_context = llamacpp_vtable_clear_context, + .json_schema_to_grammar = llamacpp_vtable_json_schema_to_grammar, }; // ============================================================================= diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp index 8ff7a45eb..1424e395e 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp +++ b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp @@ -174,10 +174,15 @@ rac_result_t rac_llm_llamacpp_generate(rac_handle_t handle, const char* prompt, } } } + // Wire grammar field for constrained decoding + if (options->grammar != nullptr) { + request.grammar = options->grammar; + } RAC_LOG_INFO("LLM.LlamaCpp.C-API","[PARAMS] LLM C-API (from caller options): max_tokens=%d, temperature=%.4f, " - "top_p=%.4f, system_prompt=%s", + "top_p=%.4f, system_prompt=%s, grammar=%s", request.max_tokens, request.temperature, request.top_p, - request.system_prompt.empty() ? "(none)" : "(set)"); + request.system_prompt.empty() ? "(none)" : "(set)", + request.grammar.empty() ? 
"(none)" : "(set)"); } else { RAC_LOG_INFO("LLM.LlamaCpp.C-API","[PARAMS] LLM C-API (using struct defaults): max_tokens=%d, temperature=%.4f, " "top_p=%.4f, system_prompt=(none)", @@ -264,10 +269,15 @@ rac_result_t rac_llm_llamacpp_generate_stream(rac_handle_t handle, const char* p } } } + // Wire grammar field for constrained decoding + if (options->grammar != nullptr) { + request.grammar = options->grammar; + } RAC_LOG_INFO("LLM.LlamaCpp.C-API","[PARAMS] LLM C-API (from caller options): max_tokens=%d, temperature=%.4f, " - "top_p=%.4f, system_prompt=%s", + "top_p=%.4f, system_prompt=%s, grammar=%s", request.max_tokens, request.temperature, request.top_p, - request.system_prompt.empty() ? "(none)" : "(set)"); + request.system_prompt.empty() ? "(none)" : "(set)", + request.grammar.empty() ? "(none)" : "(set)"); } else { RAC_LOG_INFO("LLM.LlamaCpp.C-API","[PARAMS] LLM C-API (using struct defaults): max_tokens=%d, temperature=%.4f, " "top_p=%.4f, system_prompt=(none)", @@ -479,6 +489,10 @@ rac_result_t rac_llm_llamacpp_generate_from_context(rac_handle_t handle, const c } } } + // Wire grammar field for constrained decoding + if (options->grammar != nullptr) { + request.grammar = options->grammar; + } } try { @@ -522,6 +536,37 @@ rac_result_t rac_llm_llamacpp_clear_context(rac_handle_t handle) { return RAC_SUCCESS; } +// ============================================================================= +// JSON SCHEMA → GBNF GRAMMAR CONVERSION +// ============================================================================= + +rac_result_t rac_llm_llamacpp_json_schema_to_grammar(rac_handle_t handle, + const char* json_schema, + char** out_grammar) { + if (handle == nullptr || json_schema == nullptr || out_grammar == nullptr) { + return RAC_ERROR_NULL_POINTER; + } + + auto* h = static_cast(handle); + if (!h->text_gen) { + return RAC_ERROR_INVALID_HANDLE; + } + + try { + std::string grammar = h->text_gen->convert_json_schema_to_grammar(json_schema); + if (grammar.empty()) { 
+            rac_error_set_details("Failed to convert JSON schema to GBNF grammar");
+            return RAC_ERROR_INVALID_ARGUMENT;
+        }
+
+        *out_grammar = strdup(grammar.c_str());
+        return RAC_SUCCESS;
+    } catch (const std::exception& e) {
+        rac_error_set_details(e.what());
+        return RAC_ERROR_INFERENCE_FAILED;
+    }
+}
+
 void rac_llm_llamacpp_destroy(rac_handle_t handle) {
     if (handle == nullptr) {
         return;
diff --git a/sdk/runanywhere-commons/src/features/llm/llm_component.cpp b/sdk/runanywhere-commons/src/features/llm/llm_component.cpp
index 1c8a4da4d..10291bd8f 100644
--- a/sdk/runanywhere-commons/src/features/llm/llm_component.cpp
+++ b/sdk/runanywhere-commons/src/features/llm/llm_component.cpp
@@ -770,6 +770,149 @@ extern "C" rac_result_t rac_llm_component_generate_stream(
     return RAC_SUCCESS;
 }
 
+// =============================================================================
+// STRUCTURED OUTPUT API
+// =============================================================================
+
+/**
+ * Generate structured output, constraining decoding with a GBNF grammar derived
+ * from so_config->json_schema when so_config->use_grammar is set.
+ *
+ * If the backend cannot produce a grammar, the failure is logged and generation
+ * proceeds without one, so the output is not grammar-constrained in that case.
+ *
+ * Locking: component->mtx is held while the service is resolved, the default
+ * options are read and the grammar is converted. It is released before calling
+ * rac_llm_component_generate(), a public entry point on the same component;
+ * entering it with the mutex still held would self-deadlock if it takes the
+ * same non-recursive mutex.
+ */
+extern "C" rac_result_t rac_llm_component_generate_structured(
+    rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
+    const rac_structured_output_config_t* so_config, rac_llm_result_t* out_result) {
+    if (!handle)
+        return RAC_ERROR_INVALID_HANDLE;
+    if (!prompt || !so_config || !out_result)
+        return RAC_ERROR_INVALID_ARGUMENT;
+
+    auto* component = reinterpret_cast(handle);
+    std::unique_lock lock(component->mtx);
+
+    // Get service from lifecycle manager
+    rac_handle_t service = nullptr;
+    rac_result_t result = rac_lifecycle_require_service(component->lifecycle, &service);
+    if (result != RAC_SUCCESS) {
+        log_error("LLM.Component", "No model loaded - cannot generate structured");
+        return result;
+    }
+
+    // Use provided options or defaults
+    const rac_llm_options_t* base_options = options ? options : &component->default_options;
+
+    // Build effective options with grammar if requested
+    rac_llm_options_t effective_options = *base_options;
+
+    char* grammar_str = nullptr;
+    if (so_config->use_grammar && so_config->json_schema != nullptr) {
+        // Convert JSON schema to GBNF grammar via backend
+        result = rac_llm_json_schema_to_grammar(service, so_config->json_schema, &grammar_str);
+        if (result == RAC_SUCCESS && grammar_str) {
+            effective_options.grammar = grammar_str;
+            RAC_LOG_INFO("LLM.Component", "Grammar-constrained structured output enabled");
+        } else {
+            RAC_LOG_WARNING("LLM.Component",
+                            "Grammar conversion failed (result=%d), falling back to prompt-only",
+                            result);
+            // Proceed without grammar — prompt-only fallback
+        }
+    }
+
+    // Release the component lock before delegating: the callee is a public entry
+    // point on this component and must be free to take the mutex itself
+    lock.unlock();
+
+    // Delegate to standard generate with grammar-augmented options
+    result = rac_llm_component_generate(handle, prompt, &effective_options, out_result);
+
+    // Free grammar string if we allocated one
+    if (grammar_str) {
+        free(grammar_str);
+    }
+
+    return result;
+}
+
+/**
+ * Streaming variant of rac_llm_component_generate_structured(): same grammar
+ * setup, delegating to rac_llm_component_generate_stream().
+ *
+ * component->mtx is released before error_callback is invoked and before the
+ * delegated stream call, so neither runs with the component lock held.
+ */
+extern "C" rac_result_t rac_llm_component_generate_structured_stream(
+    rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
+    const rac_structured_output_config_t* so_config,
+    rac_llm_component_token_callback_fn token_callback,
+    rac_llm_component_complete_callback_fn complete_callback,
+    rac_llm_component_error_callback_fn error_callback, void* user_data) {
+    if (!handle)
+        return RAC_ERROR_INVALID_HANDLE;
+    if (!prompt || !so_config)
+        return RAC_ERROR_INVALID_ARGUMENT;
+
+    auto* component = reinterpret_cast(handle);
+    std::unique_lock lock(component->mtx);
+
+    // Get service from lifecycle manager
+    rac_handle_t service = nullptr;
+    rac_result_t result = rac_lifecycle_require_service(component->lifecycle, &service);
+    if (result != RAC_SUCCESS) {
+        log_error("LLM.Component", "No model loaded - cannot generate structured stream");
+        // Do not run user code while holding the component lock
+        lock.unlock();
+        if (error_callback) {
+            error_callback(result, "No model loaded", user_data);
+        }
+        return result;
+    }
+
+    // Use provided options or defaults
+    const rac_llm_options_t* base_options = options ? options : &component->default_options;
+
+    // Build effective options with grammar if requested
+    rac_llm_options_t effective_options = *base_options;
+
+    char* grammar_str = nullptr;
+    if (so_config->use_grammar && so_config->json_schema != nullptr) {
+        result = rac_llm_json_schema_to_grammar(service, so_config->json_schema, &grammar_str);
+        if (result == RAC_SUCCESS && grammar_str) {
+            effective_options.grammar = grammar_str;
+            RAC_LOG_INFO("LLM.Component", "Grammar-constrained structured streaming enabled");
+        } else {
+            RAC_LOG_WARNING("LLM.Component",
+                            "Grammar conversion failed (result=%d), falling back to prompt-only",
+                            result);
+        }
+    }
+
+    // Release the component lock before delegating: the callee is a public entry
+    // point on this component and must be free to take the mutex itself
+    lock.unlock();
+
+    // Delegate to standard stream generate with grammar-augmented options
+    result = rac_llm_component_generate_stream(handle, prompt, &effective_options, token_callback,
+                                               complete_callback, error_callback, user_data);
+
+    // Free grammar string if we allocated one.
+    // NOTE(review): assumes the stream call no longer reads options->grammar once
+    // it has returned — confirm against rac_llm_component_generate_stream
+    if (grammar_str) {
+        free(grammar_str);
+    }
+
+    return result;
+}
+
 extern "C" rac_result_t rac_llm_component_cancel(rac_handle_t handle) {
     if (!handle)
         return RAC_ERROR_INVALID_HANDLE;
diff --git a/sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp b/sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp
index 6705f13a4..6bce63055 100644
--- a/sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp
+++ b/sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp
@@ -268,4 +268,20 @@ rac_result_t rac_llm_clear_context(rac_handle_t handle) {
     return service->ops->clear_context(service->impl);
 }
 
+// =============================================================================
+// JSON SCHEMA → GBNF GRAMMAR - VTable dispatch
+// =============================================================================
+
+rac_result_t rac_llm_json_schema_to_grammar(rac_handle_t handle, const char* json_schema,
+                                            char** out_grammar) {
+    if (!handle ||
!json_schema || !out_grammar) + return RAC_ERROR_NULL_POINTER; + + auto* service = static_cast(handle); + if (!service->ops || !service->ops->json_schema_to_grammar) + return RAC_ERROR_NOT_SUPPORTED; + + return service->ops->json_schema_to_grammar(service->impl, json_schema, out_grammar); +} + } // extern "C" diff --git a/sdk/runanywhere-commons/src/features/platform/rac_backend_platform_register.cpp b/sdk/runanywhere-commons/src/features/platform/rac_backend_platform_register.cpp index cb93eae3f..46789606f 100644 --- a/sdk/runanywhere-commons/src/features/platform/rac_backend_platform_register.cpp +++ b/sdk/runanywhere-commons/src/features/platform/rac_backend_platform_register.cpp @@ -151,6 +151,15 @@ static const rac_llm_service_ops_t g_platform_llm_ops = { .cancel = platform_llm_vtable_cancel, .cleanup = platform_llm_vtable_cleanup, .destroy = platform_llm_vtable_destroy, + .load_lora = nullptr, + .remove_lora = nullptr, + .clear_lora = nullptr, + .get_lora_info = nullptr, + .inject_system_prompt = nullptr, + .append_context = nullptr, + .generate_from_context = nullptr, + .clear_context = nullptr, + .json_schema_to_grammar = nullptr, }; // ============================================================================= diff --git a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp index 9e37f4dfe..45a5a7a93 100644 --- a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp +++ b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp @@ -552,6 +552,7 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate // Parse configJson if provided std::string sys_prompt_storage; + std::string grammar_storage; if (config != nullptr) { try { auto j = nlohmann::json::parse(config); @@ -562,6 +563,10 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate if (!sys_prompt_storage.empty()) { options.system_prompt = sys_prompt_storage.c_str(); } 
+ grammar_storage = j.value("grammar", std::string("")); + if (!grammar_storage.empty()) { + options.grammar = grammar_storage.c_str(); + } } catch (const nlohmann::json::exception& e) { LOGe("Failed to parse LLM config JSON: %s", e.what()); } @@ -859,6 +864,7 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate // Parse configJson if provided std::string sys_prompt_storage; + std::string grammar_storage; if (config != nullptr) { try { auto j = nlohmann::json::parse(config); @@ -869,6 +875,10 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate if (!sys_prompt_storage.empty()) { options.system_prompt = sys_prompt_storage.c_str(); } + grammar_storage = j.value("grammar", std::string("")); + if (!grammar_storage.empty()) { + options.grammar = grammar_storage.c_str(); + } } catch (const nlohmann::json::exception& e) { LOGe("Failed to parse LLM config JSON: %s", e.what()); } @@ -994,6 +1004,7 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate // Parse configJson if provided std::string sys_prompt_storage; + std::string grammar_storage; if (config != nullptr) { try { auto j = nlohmann::json::parse(config); @@ -1004,6 +1015,10 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate if (!sys_prompt_storage.empty()) { options.system_prompt = sys_prompt_storage.c_str(); } + grammar_storage = j.value("grammar", std::string("")); + if (!grammar_storage.empty()) { + options.grammar = grammar_storage.c_str(); + } } catch (const nlohmann::json::exception& e) { LOGe("Failed to parse LLM config JSON: %s", e.what()); } diff --git a/sdk/runanywhere-kotlin/modules/runanywhere-core-llamacpp/src/jvmAndroidMain/kotlin/com/runanywhere/sdk/llm/llamacpp/LlamaCPPBridge.kt b/sdk/runanywhere-kotlin/modules/runanywhere-core-llamacpp/src/jvmAndroidMain/kotlin/com/runanywhere/sdk/llm/llamacpp/LlamaCPPBridge.kt index 8e9c1824a..97aad21a9 100644 --- 
a/sdk/runanywhere-kotlin/modules/runanywhere-core-llamacpp/src/jvmAndroidMain/kotlin/com/runanywhere/sdk/llm/llamacpp/LlamaCPPBridge.kt +++ b/sdk/runanywhere-kotlin/modules/runanywhere-core-llamacpp/src/jvmAndroidMain/kotlin/com/runanywhere/sdk/llm/llamacpp/LlamaCPPBridge.kt @@ -113,6 +113,63 @@ internal object LlamaCPPBridge { @JvmStatic external fun nativeGetVersion(): String + // ========================================================================== + // LLM Direct Operations + // ========================================================================== + + /** + * Create a LlamaCPP instance and load a model. + * + * @param modelPath Path to the GGUF model file + * @param contextSize Context window size + * @param numThreads Number of inference threads + * @param gpuLayers Number of layers to offload to GPU + * @return Native handle (0 on failure) + */ + @JvmStatic + external fun nativeCreate(modelPath: String, contextSize: Int, numThreads: Int, gpuLayers: Int): Long + + /** + * Destroy a LlamaCPP instance. + */ + @JvmStatic + external fun nativeDestroy(handle: Long) + + /** + * Generate text (blocking). + * + * @param handle Native handle + * @param prompt Input prompt + * @param maxTokens Max tokens to generate + * @param temperature Sampling temperature + * @param grammar GBNF grammar string for constrained decoding (null for unconstrained) + * @return Generated text or null on failure + */ + @JvmStatic + external fun nativeGenerate(handle: Long, prompt: String, maxTokens: Int, temperature: Float, grammar: String?): String? + + /** + * Convert a JSON Schema to a GBNF grammar string. + * + * @param handle Native handle + * @param jsonSchema JSON Schema string + * @return GBNF grammar string or null on failure + */ + @JvmStatic + external fun nativeJsonSchemaToGrammar(handle: Long, jsonSchema: String): String? + + /** + * Cancel ongoing generation. + */ + @JvmStatic + external fun nativeCancel(handle: Long) + + /** + * Get model info as JSON. 
+ */ + @JvmStatic + external fun nativeGetModelInfo(handle: Long): String? + // ========================================================================== // VLM Registration JNI Methods // ========================================================================== diff --git a/sdk/runanywhere-kotlin/src/commonMain/kotlin/com/runanywhere/sdk/public/extensions/LLM/LLMTypes.kt b/sdk/runanywhere-kotlin/src/commonMain/kotlin/com/runanywhere/sdk/public/extensions/LLM/LLMTypes.kt index 053e0d155..a7f990852 100644 --- a/sdk/runanywhere-kotlin/src/commonMain/kotlin/com/runanywhere/sdk/public/extensions/LLM/LLMTypes.kt +++ b/sdk/runanywhere-kotlin/src/commonMain/kotlin/com/runanywhere/sdk/public/extensions/LLM/LLMTypes.kt @@ -239,6 +239,22 @@ interface Generatable { } } +/** + * Fallback strategy when grammar-constrained decoding fails. + * Mirrors Swift StructuredOutputFallback and C rac_structured_output_fallback. + */ +@Serializable +enum class StructuredOutputFallback(val value: Int) { + /** Return raw output without validation */ + RAW(0), + + /** Retry generation (default) */ + RETRY(1), + + /** Fall back to prompt-only mode (no grammar constraint) */ + PROMPT_ONLY(2), +} + /** * Structured output configuration. * Note: In Kotlin, we use KClass instead of Type. 
@@ -251,6 +267,12 @@ data class StructuredOutputConfig( val includeSchemaInPrompt: Boolean = true, /** JSON schema for the type */ val jsonSchema: String = Generatable.DEFAULT_JSON_SCHEMA, + /** Whether to use GBNF grammar-constrained decoding (default: true) */ + val useGrammar: Boolean = true, + /** Maximum retries for structured output parsing (default: 3) */ + val maxRetries: Int = 3, + /** Fallback strategy when grammar fails */ + val fallback: StructuredOutputFallback = StructuredOutputFallback.RETRY, ) /** diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/CRACommons/include/rac_llm_component.h b/sdk/runanywhere-swift/Sources/RunAnywhere/CRACommons/include/rac_llm_component.h index d2566b75c..91f17b7c4 100644 --- a/sdk/runanywhere-swift/Sources/RunAnywhere/CRACommons/include/rac_llm_component.h +++ b/sdk/runanywhere-swift/Sources/RunAnywhere/CRACommons/include/rac_llm_component.h @@ -166,6 +166,51 @@ RAC_API rac_result_t rac_llm_component_generate(rac_handle_t handle, const char* const rac_llm_options_t* options, rac_llm_result_t* out_result); +// ============================================================================= +// STRUCTURED OUTPUT - Grammar-constrained generation +// ============================================================================= + +/** + * @brief Generate structured output with grammar-constrained decoding + * + * Converts JSON schema to GBNF grammar, applies grammar constraint during + * token generation so the LLM can only produce valid JSON matching the schema. + * Falls back to prompt-only mode if grammar conversion is not supported. 
+ * + * @param handle Component handle + * @param prompt Input prompt + * @param options Generation options (can be NULL for defaults) + * @param so_config Structured output config with JSON schema and fallback settings + * @param out_result Output: Generation result (text will be valid JSON) + * @return RAC_SUCCESS or error code + */ +RAC_API rac_result_t rac_llm_component_generate_structured( + rac_handle_t handle, const char* prompt, const rac_llm_options_t* options, + const rac_structured_output_config_t* so_config, rac_llm_result_t* out_result); + +/** + * @brief Generate structured output with streaming and grammar constraints + * + * Same as generate_structured but with token-by-token streaming callbacks. + * Each emitted token is guaranteed to conform to the grammar. + * + * @param handle Component handle + * @param prompt Input prompt + * @param options Generation options (can be NULL for defaults) + * @param so_config Structured output config with JSON schema + * @param token_callback Called for each generated token + * @param complete_callback Called when generation completes + * @param error_callback Called on error + * @param user_data User context passed to callbacks + * @return RAC_SUCCESS or error code + */ +RAC_API rac_result_t rac_llm_component_generate_structured_stream( + rac_handle_t handle, const char* prompt, const rac_llm_options_t* options, + const rac_structured_output_config_t* so_config, + rac_llm_component_token_callback_fn token_callback, + rac_llm_component_complete_callback_fn complete_callback, + rac_llm_component_error_callback_fn error_callback, void* user_data); + /** * @brief Check if streaming is supported * diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/CRACommons/include/rac_llm_service.h b/sdk/runanywhere-swift/Sources/RunAnywhere/CRACommons/include/rac_llm_service.h index f353b5fbd..cf518202f 100644 --- a/sdk/runanywhere-swift/Sources/RunAnywhere/CRACommons/include/rac_llm_service.h +++ 
b/sdk/runanywhere-swift/Sources/RunAnywhere/CRACommons/include/rac_llm_service.h @@ -78,6 +78,12 @@ typedef struct rac_llm_service_ops { /** Clear all KV cache state (optional, NULL if not supported) */ rac_result_t (*clear_context)(void* impl); + + /** + * Convert JSON Schema to GBNF grammar string (optional, NULL if not supported). + * Caller must free out_grammar with rac_free(). + */ + rac_result_t (*json_schema_to_grammar)(void* impl, const char* json_schema, char** out_grammar); } rac_llm_service_ops_t; /** @@ -185,6 +191,21 @@ RAC_API void rac_llm_destroy(rac_handle_t handle); */ RAC_API void rac_llm_result_free(rac_llm_result_t* result); +/** + * @brief Convert JSON Schema to GBNF grammar string + * + * Routes through service registry to the backend's json_schema_to_grammar op. + * The resulting GBNF grammar can be passed in rac_llm_options_t.grammar + * for grammar-constrained decoding. + * + * @param handle Service handle + * @param json_schema JSON Schema string + * @param out_grammar Output: GBNF grammar string (caller must free with rac_free) + * @return RAC_SUCCESS or RAC_ERROR_NOT_SUPPORTED if backend doesn't support grammar + */ +RAC_API rac_result_t rac_llm_json_schema_to_grammar(rac_handle_t handle, const char* json_schema, + char** out_grammar); + // ============================================================================= // ADAPTIVE CONTEXT API - For RAG and similar pipelines // ============================================================================= diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/CRACommons/include/rac_llm_types.h b/sdk/runanywhere-swift/Sources/RunAnywhere/CRACommons/include/rac_llm_types.h index 04a59ca42..1f3ee29c6 100644 --- a/sdk/runanywhere-swift/Sources/RunAnywhere/CRACommons/include/rac_llm_types.h +++ b/sdk/runanywhere-swift/Sources/RunAnywhere/CRACommons/include/rac_llm_types.h @@ -93,6 +93,9 @@ typedef struct rac_llm_options { /** System prompt (can be NULL) */ const char* system_prompt; + + /** GBNF 
grammar string for constrained decoding (can be NULL for unconstrained) */ + const char* grammar; } rac_llm_options_t; /** @@ -104,7 +107,8 @@ static const rac_llm_options_t RAC_LLM_OPTIONS_DEFAULT = {.max_tokens = 100, .stop_sequences = RAC_NULL, .num_stop_sequences = 0, .streaming_enabled = RAC_FALSE, - .system_prompt = RAC_NULL}; + .system_prompt = RAC_NULL, + .grammar = RAC_NULL}; // ============================================================================= // RESULT - Mirrors Swift's LLMGenerationResult @@ -209,6 +213,18 @@ static const rac_thinking_tag_pattern_t RAC_THINKING_TAG_FULL = {.opening_tag = // STRUCTURED OUTPUT - Mirrors Swift's StructuredOutputConfig // ============================================================================= +/** + * @brief Fallback strategy when grammar-constrained structured output fails + */ +typedef enum rac_structured_output_fallback { + /** Return raw text output (no parsing attempt) */ + RAC_STRUCTURED_OUTPUT_FALLBACK_RAW = 0, + /** Retry generation with grammar constraint (default) */ + RAC_STRUCTURED_OUTPUT_FALLBACK_RETRY = 1, + /** Fall back to prompt-only mode (no grammar constraint) */ + RAC_STRUCTURED_OUTPUT_FALLBACK_PROMPT_ONLY = 2 +} rac_structured_output_fallback_t; + /** * @brief Structured output configuration * @@ -223,13 +239,26 @@ typedef struct rac_structured_output_config { /** Whether to include the schema in the prompt */ rac_bool_t include_schema_in_prompt; + + /** Enable GBNF grammar-constrained decoding (default: true when json_schema is set) */ + rac_bool_t use_grammar; + + /** Maximum retry attempts on failure (default: 3) */ + int32_t max_retries; + + /** Fallback strategy on failure (default: RETRY) */ + rac_structured_output_fallback_t fallback; } rac_structured_output_config_t; /** * @brief Default structured output configuration */ static const rac_structured_output_config_t RAC_STRUCTURED_OUTPUT_DEFAULT = { - .json_schema = RAC_NULL, .include_schema_in_prompt = RAC_TRUE}; + .json_schema 
= RAC_NULL, + .include_schema_in_prompt = RAC_TRUE, + .use_grammar = RAC_TRUE, + .max_retries = 3, + .fallback = RAC_STRUCTURED_OUTPUT_FALLBACK_RETRY}; /** * @brief Structured output validation result diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/LLMTypes.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/LLMTypes.swift index aee411620..8d7cd8762 100644 --- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/LLMTypes.swift +++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/LLMTypes.swift @@ -540,6 +540,16 @@ public extension Generatable { } } +/// Fallback strategy when grammar-constrained structured output fails +public enum StructuredOutputFallback: Int32, Sendable { + /// Return raw text output (no parsing attempt) + case raw = 0 + /// Retry generation with grammar constraint (default) + case retry = 1 + /// Fall back to prompt-only mode (no grammar constraint) + case promptOnly = 2 +} + /// Structured output configuration public struct StructuredOutputConfig: @unchecked Sendable { /// The type to generate @@ -548,12 +558,27 @@ public struct StructuredOutputConfig: @unchecked Sendable { /// Whether to include schema in prompt public let includeSchemaInPrompt: Bool + /// Enable GBNF grammar-constrained decoding (default: true) + public let useGrammar: Bool + + /// Maximum retry attempts on failure (default: 3) + public let maxRetries: Int + + /// Fallback strategy on failure (default: .retry) + public let fallback: StructuredOutputFallback + public init( type: Generatable.Type, - includeSchemaInPrompt: Bool = true + includeSchemaInPrompt: Bool = true, + useGrammar: Bool = true, + maxRetries: Int = 3, + fallback: StructuredOutputFallback = .retry ) { self.type = type self.includeSchemaInPrompt = includeSchemaInPrompt + self.useGrammar = useGrammar + self.maxRetries = maxRetries + self.fallback = fallback } } diff --git 
a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+StructuredOutput.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+StructuredOutput.swift index 4f3a27d9f..71b43b01b 100644 --- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+StructuredOutput.swift +++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+StructuredOutput.swift @@ -224,7 +224,7 @@ public extension RunAnywhere { } } - /// Internal generation for structured output (calls C++ directly) + /// Internal generation for structured output using grammar-constrained decoding private static func generateForStructuredOutput( _ prompt: String, options: LLMGenerationOptions @@ -248,21 +248,59 @@ public extension RunAnywhere { cOptions.temperature = options.temperature cOptions.top_p = options.topP cOptions.streaming_enabled = RAC_FALSE + cOptions.grammar = nil + + // Build structured output C config + var soConfig = rac_structured_output_config_t() + if let structuredOutput = options.structuredOutput { + soConfig.include_schema_in_prompt = structuredOutput.includeSchemaInPrompt ? RAC_TRUE : RAC_FALSE + soConfig.use_grammar = structuredOutput.useGrammar ? 
RAC_TRUE : RAC_FALSE + soConfig.max_retries = Int32(structuredOutput.maxRetries) + soConfig.fallback = rac_structured_output_fallback(rawValue: UInt32(structuredOutput.fallback.rawValue)) + } else { + soConfig.include_schema_in_prompt = RAC_TRUE + soConfig.use_grammar = RAC_TRUE + soConfig.max_retries = 3 + soConfig.fallback = RAC_STRUCTURED_OUTPUT_FALLBACK_RETRY + } - // Generate - wrap in system_prompt lifetime scope + // Generate using grammar-constrained structured output API var llmResult = rac_llm_result_t() let generateResult: rac_result_t + + // Get JSON schema from the structured output type + let jsonSchema = options.structuredOutput?.type.jsonSchema + if let systemPrompt = options.systemPrompt { generateResult = systemPrompt.withCString { sysPromptPtr in cOptions.system_prompt = sysPromptPtr return prompt.withCString { promptPtr in - rac_llm_component_generate(handle, promptPtr, &cOptions, &llmResult) + if let schema = jsonSchema { + return schema.withCString { schemaPtr in + soConfig.json_schema = schemaPtr + return rac_llm_component_generate_structured( + handle, promptPtr, &cOptions, &soConfig, &llmResult + ) + } + } else { + // No schema - fall back to regular generate + return rac_llm_component_generate(handle, promptPtr, &cOptions, &llmResult) + } } } } else { cOptions.system_prompt = nil generateResult = prompt.withCString { promptPtr in - rac_llm_component_generate(handle, promptPtr, &cOptions, &llmResult) + if let schema = jsonSchema { + return schema.withCString { schemaPtr in + soConfig.json_schema = schemaPtr + return rac_llm_component_generate_structured( + handle, promptPtr, &cOptions, &soConfig, &llmResult + ) + } + } else { + return rac_llm_component_generate(handle, promptPtr, &cOptions, &llmResult) + } } } diff --git a/sdk/runanywhere-web/packages/core/src/Foundation/StructOffsets.ts b/sdk/runanywhere-web/packages/core/src/Foundation/StructOffsets.ts index 5ccc00fc9..e5fe04a9c 100644 --- 
a/sdk/runanywhere-web/packages/core/src/Foundation/StructOffsets.ts +++ b/sdk/runanywhere-web/packages/core/src/Foundation/StructOffsets.ts @@ -13,12 +13,12 @@ // --------------------------------------------------------------------------- export interface ConfigOffsets { logLevel: number; } -export interface LLMOptionsOffsets { maxTokens: number; temperature: number; topP: number; systemPrompt: number; } +export interface LLMOptionsOffsets { maxTokens: number; temperature: number; topP: number; systemPrompt: number; grammar: number; } export interface LLMResultOffsets { text: number; promptTokens: number; completionTokens: number; } export interface VLMImageOffsets { format: number; filePath: number; pixelData: number; base64Data: number; width: number; height: number; dataSize: number; } export interface VLMOptionsOffsets { maxTokens: number; temperature: number; topP: number; streamingEnabled: number; systemPrompt: number; modelFamily: number; } export interface VLMResultOffsets { text: number; promptTokens: number; imageTokens: number; completionTokens: number; totalTokens: number; timeToFirstTokenMs: number; imageEncodeTimeMs: number; totalTimeMs: number; tokensPerSecond: number; } -export interface StructuredOutputConfigOffsets { jsonSchema: number; includeSchemaInPrompt: number; } +export interface StructuredOutputConfigOffsets { jsonSchema: number; includeSchemaInPrompt: number; useGrammar: number; maxRetries: number; fallback: number; } export interface StructuredOutputValidationOffsets { isValid: number; errorMessage: number; extractedJson: number; } export interface EmbeddingsOptionsOffsets { normalize: number; pooling: number; nThreads: number; } export interface EmbeddingsResultOffsets { embeddings: number; numEmbeddings: number; dimension: number; processingTimeMs: number; totalTokens: number; } diff --git a/sdk/runanywhere-web/packages/llamacpp/src/Extensions/RunAnywhere+StructuredOutput.ts 
b/sdk/runanywhere-web/packages/llamacpp/src/Extensions/RunAnywhere+StructuredOutput.ts index 04f98f89d..6e0d2a167 100644 --- a/sdk/runanywhere-web/packages/llamacpp/src/Extensions/RunAnywhere+StructuredOutput.ts +++ b/sdk/runanywhere-web/packages/llamacpp/src/Extensions/RunAnywhere+StructuredOutput.ts @@ -32,11 +32,27 @@ function requireBridge(): LlamaCppBridge { // Structured Output Types // --------------------------------------------------------------------------- +/** Fallback strategy when grammar-constrained decoding fails */ +export enum StructuredOutputFallback { + /** Return raw output without validation */ + Raw = 0, + /** Retry generation (default) */ + Retry = 1, + /** Fall back to prompt-only mode (no grammar constraint) */ + PromptOnly = 2, +} + export interface StructuredOutputConfig { /** JSON Schema string */ jsonSchema: string; /** Whether to include the schema in the prompt (default: true) */ includeSchemaInPrompt?: boolean; + /** Whether to use GBNF grammar-constrained decoding (default: true) */ + useGrammar?: boolean; + /** Maximum retries for structured output parsing (default: 3) */ + maxRetries?: number; + /** Fallback strategy when grammar fails (default: Retry) */ + fallback?: StructuredOutputFallback; } export interface StructuredOutputValidation { @@ -106,6 +122,9 @@ export const StructuredOutput = { const schemaPtr = bridge.allocString(config.jsonSchema); m.setValue(configPtr + soConf.jsonSchema, schemaPtr, '*'); m.setValue(configPtr + soConf.includeSchemaInPrompt, (config.includeSchemaInPrompt !== false) ? 1 : 0, 'i32'); + m.setValue(configPtr + soConf.useGrammar, (config.useGrammar !== false) ? 1 : 0, 'i32'); + m.setValue(configPtr + soConf.maxRetries, config.maxRetries ?? 3, 'i32'); + m.setValue(configPtr + soConf.fallback, config.fallback ?? 
StructuredOutputFallback.Retry, 'i32'); const outPromptPtr = m._malloc(4); @@ -188,6 +207,9 @@ export const StructuredOutput = { const schemaPtr = bridge.allocString(config.jsonSchema); m.setValue(configPtr + soConf2.jsonSchema, schemaPtr, '*'); m.setValue(configPtr + soConf2.includeSchemaInPrompt, (config.includeSchemaInPrompt !== false) ? 1 : 0, 'i32'); + m.setValue(configPtr + soConf2.useGrammar, (config.useGrammar !== false) ? 1 : 0, 'i32'); + m.setValue(configPtr + soConf2.maxRetries, config.maxRetries ?? 3, 'i32'); + m.setValue(configPtr + soConf2.fallback, config.fallback ?? StructuredOutputFallback.Retry, 'i32'); // rac_structured_output_validation_t (size from sizeof helper) const valSize = 12; // 3 fields × 4 bytes on wasm32 — all i32/ptr diff --git a/sdk/runanywhere-web/packages/llamacpp/src/Foundation/LlamaCppOffsets.ts b/sdk/runanywhere-web/packages/llamacpp/src/Foundation/LlamaCppOffsets.ts index 0ecaa0be7..d9509d6fe 100644 --- a/sdk/runanywhere-web/packages/llamacpp/src/Foundation/LlamaCppOffsets.ts +++ b/sdk/runanywhere-web/packages/llamacpp/src/Foundation/LlamaCppOffsets.ts @@ -44,6 +44,7 @@ function loadLLMOptionsOffsets(): LLMOptionsOffsets { temperature: off('llm_options_temperature'), topP: off('llm_options_top_p'), systemPrompt: off('llm_options_system_prompt'), + grammar: off('llm_options_grammar'), }; } @@ -96,6 +97,9 @@ function loadStructuredOutputConfigOffsets(): StructuredOutputConfigOffsets { return { jsonSchema: off('structured_output_config_json_schema'), includeSchemaInPrompt: off('structured_output_config_include_schema_in_prompt'), + useGrammar: off('structured_output_config_use_grammar'), + maxRetries: off('structured_output_config_max_retries'), + fallback: off('structured_output_config_fallback'), }; } diff --git a/sdk/runanywhere-web/packages/llamacpp/src/index.ts b/sdk/runanywhere-web/packages/llamacpp/src/index.ts index 8a1e282c2..4f41fcdc9 100644 --- a/sdk/runanywhere-web/packages/llamacpp/src/index.ts +++ 
b/sdk/runanywhere-web/packages/llamacpp/src/index.ts @@ -39,7 +39,7 @@ export type { ToolValue, ToolParameterType, ToolParameter, ToolDefinition, ToolCall, ToolResult, ToolCallingOptions, ToolCallingResult, ToolExecutor, } from './Extensions/RunAnywhere+ToolCalling'; -export { StructuredOutput } from './Extensions/RunAnywhere+StructuredOutput'; +export { StructuredOutput, StructuredOutputFallback } from './Extensions/RunAnywhere+StructuredOutput'; export type { StructuredOutputConfig, StructuredOutputValidation } from './Extensions/RunAnywhere+StructuredOutput'; export { Diffusion } from './Extensions/RunAnywhere+Diffusion'; export { DiffusionScheduler, DiffusionModelVariant, DiffusionMode } from './Extensions/RunAnywhere+Diffusion'; diff --git a/sdk/runanywhere-web/wasm/src/wasm_exports.cpp b/sdk/runanywhere-web/wasm/src/wasm_exports.cpp index 430790a22..1c55f6b67 100644 --- a/sdk/runanywhere-web/wasm/src/wasm_exports.cpp +++ b/sdk/runanywhere-web/wasm/src/wasm_exports.cpp @@ -302,6 +302,9 @@ EMSCRIPTEN_KEEPALIVE int rac_wasm_offsetof_llm_options_top_p(void) { EMSCRIPTEN_KEEPALIVE int rac_wasm_offsetof_llm_options_system_prompt(void) { return (int)offsetof(rac_llm_options_t, system_prompt); } +EMSCRIPTEN_KEEPALIVE int rac_wasm_offsetof_llm_options_grammar(void) { + return (int)offsetof(rac_llm_options_t, grammar); +} // ---- rac_llm_result_t ---- EMSCRIPTEN_KEEPALIVE int rac_wasm_offsetof_llm_result_text(void) { @@ -393,6 +396,15 @@ EMSCRIPTEN_KEEPALIVE int rac_wasm_offsetof_structured_output_config_json_schema( EMSCRIPTEN_KEEPALIVE int rac_wasm_offsetof_structured_output_config_include_schema(void) { return (int)offsetof(rac_structured_output_config_t, include_schema_in_prompt); } +EMSCRIPTEN_KEEPALIVE int rac_wasm_offsetof_structured_output_config_use_grammar(void) { + return (int)offsetof(rac_structured_output_config_t, use_grammar); +} +EMSCRIPTEN_KEEPALIVE int rac_wasm_offsetof_structured_output_config_max_retries(void) { + return 
(int)offsetof(rac_structured_output_config_t, max_retries); +} +EMSCRIPTEN_KEEPALIVE int rac_wasm_offsetof_structured_output_config_fallback(void) { + return (int)offsetof(rac_structured_output_config_t, fallback); +} // ---- rac_structured_output_validation_t ---- EMSCRIPTEN_KEEPALIVE int rac_wasm_offsetof_structured_output_validation_is_valid(void) {