 
 #include <algorithm>
 #include <chrono>
+#include <climits>
 #include <cstring>
 #include <string>
 #include <vector>
@@ -239,70 +240,139 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
     model_params.use_mmap = false;
 #endif
 
-    // Detect model size from filename to set appropriate GPU layers BEFORE loading
-    // This prevents OOM crashes on mobile devices with limited GPU memory
-    // Note: We use filename heuristics here because we can't know param count until after loading
-    std::string path_lower = model_path;
-    std::transform(path_lower.begin(), path_lower.end(), path_lower.begin(), ::tolower);
-
-    int gpu_layers = -1;  // Default: all layers to GPU
-
-    // Check for large model indicators in filename using word boundary detection
-    // Patterns like "7b", "8b", "13b" should match at word boundaries to avoid
-    // false positives like "/backup7b/" or "/2017beta/"
-    auto is_model_size_marker = [&path_lower](const char* marker) {
-        size_t pos = path_lower.find(marker);
-        while (pos != std::string::npos) {
-            // Check for word boundary before (start of string, or non-alphanumeric)
-            bool valid_start = (pos == 0) || !std::isalnum(path_lower[pos - 1]);
-            // Check for word boundary after (end of string, or non-alphanumeric except digits for patterns like "7b-q4")
-            size_t end_pos = pos + strlen(marker);
-            bool valid_end = (end_pos >= path_lower.size()) ||
-                             (!std::isalpha(path_lower[end_pos]) || path_lower[end_pos] == '-' || path_lower[end_pos] == '_');
-
-            if (valid_start && valid_end) {
-                return true;
-            }
-            pos = path_lower.find(marker, pos + 1);
-        }
-        return false;
-    };
+    // Capture any user gpu_layers override now; it is applied after llama_params_fit below.
+    int user_gpu_layers = -1;  // -1 = not set by user
+    if (config.contains("gpu_layers")) {
+        user_gpu_layers = config["gpu_layers"].get<int>();
+        RAC_LOG_INFO("LLM.LlamaCpp", "User-provided GPU layers: %d (will apply after fit)", user_gpu_layers);
+    }
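+    // Deferring the override lets the post-fit code below warn when the user's
+    // value conflicts with the fit result, and ignore it on CPU-only builds.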
 
-    // Detect large models (7B+) that may need GPU layer limiting on mobile
-    // First check for config-based override (for custom-named models)
-    bool is_large_model = false;
-    if (config.contains("expected_params_billions")) {
-        double expected_params = config["expected_params_billions"].get<double>();
-        is_large_model = (expected_params >= kLargeModelThresholdB);
-        if (is_large_model) {
-            RAC_LOG_INFO("LLM.LlamaCpp", "Large model detected from config (%.1fB expected params)", expected_params);
-        }
+    // Set up context params early so llama_params_fit can size them
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = 0;  // 0 = no explicit request; the fit call / defaults below decide
+    ctx_params.n_threads = backend_->get_num_threads();
+    ctx_params.n_threads_batch = backend_->get_num_threads();
+    ctx_params.no_perf = true;
+
+    if (user_context_size > 0) {
+        ctx_params.n_ctx = user_context_size;
+        RAC_LOG_INFO("LLM.LlamaCpp", "User-provided context size: %d", user_context_size);
     }
 
-    // Fall back to filename heuristics if no config provided
-    if (!is_large_model) {
-        is_large_model = is_model_size_marker("7b") ||
-                         is_model_size_marker("8b") ||
-                         is_model_size_marker("9b") ||
-                         is_model_size_marker("13b") ||
-                         is_model_size_marker("70b");
+    size_t n_devices = llama_max_devices();
+    size_t n_overrides = llama_max_tensor_buft_overrides();
+
+    std::vector<float> tensor_split(n_devices, 0.0f);
+    // llama.cpp iterates tensor_buft_overrides until it hits a zero-valued
+    // sentinel entry (pattern == nullptr). Value-initializing the vector to
+    // all zeros means the first element is already that sentinel, so an
+    // empty vector is interpreted as "no tensor buft overrides."
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides(n_overrides);
+    std::vector<size_t> margins(n_devices, 0);
+
+    size_t margin_mib = 1024;  // Configurable via "fit_margin_mib"
+    if (config.contains("fit_margin_mib")) {
+        margin_mib = config["fit_margin_mib"].get<size_t>();
+    }
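+    // Assumption: llama_params_fit treats each margins[] entry as bytes of
+    // memory to leave free on that device, hence the MiB-to-bytes conversion.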
+    for (size_t i = 0; i < n_devices; ++i) {
+        margins[i] = margin_mib * 1024 * 1024;
     }
 
-    if (is_large_model) {
-        // For 7B+ models on mobile: limit GPU layers to prevent OOM
-        // Most 7B models have 32 layers, offload ~24 to GPU, rest to CPU
-        gpu_layers = kLargeModelGpuLayers;
-        RAC_LOG_INFO("LLM.LlamaCpp", "Large model detected, limiting GPU layers to %d to prevent OOM", gpu_layers);
+    uint32_t n_ctx_min = 2048;  // Configurable via "n_ctx_min"
+    if (config.contains("n_ctx_min")) {
+        n_ctx_min = config["n_ctx_min"].get<uint32_t>();
     }
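+    // Assumption (from the name): n_ctx_min is the smallest context size
+    // llama_params_fit may shrink the context to while fitting the model.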
 
-    // Allow user override via config
-    if (config.contains("gpu_layers")) {
-        gpu_layers = config["gpu_layers"].get<int>();
-        RAC_LOG_INFO("LLM.LlamaCpp", "Using user-provided GPU layers: %d", gpu_layers);
+    RAC_LOG_INFO("LLM.LlamaCpp", "Calling llama_params_fit (margin=%zuMiB, n_ctx_min=%u, n_devices=%zu)",
+                 margin_mib, n_ctx_min, n_devices);
+
+    llama_params_fit_status fit_status = llama_params_fit(
+        model_path.c_str(),
+        &model_params,
+        &ctx_params,
+        tensor_split.data(),
+        tensor_buft_overrides.data(),
+        margins.data(),
+        n_ctx_min,
+        GGML_LOG_LEVEL_INFO
+    );
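+    // Assumed contract: on SUCCESS, llama_params_fit has updated
+    // model_params.n_gpu_layers and ctx_params.n_ctx in place (both are read
+    // back below); it may also populate tensor_split and the overrides.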
+
+    switch (fit_status) {
+        case LLAMA_PARAMS_FIT_STATUS_SUCCESS:
+            RAC_LOG_INFO("LLM.LlamaCpp", "llama_params_fit SUCCESS: n_gpu_layers=%d, n_ctx=%u",
+                         model_params.n_gpu_layers, ctx_params.n_ctx);
+            break;
+        case LLAMA_PARAMS_FIT_STATUS_FAILURE:
+            RAC_LOG_INFO("LLM.LlamaCpp", "llama_params_fit FAILURE: could not fit model to device memory. "
+                         "Proceeding with conservative CPU-only defaults.");
+            model_params.n_gpu_layers = 0;
+            if (user_context_size > 2048) {
+                RAC_LOG_INFO("LLM.LlamaCpp", "Capping user-requested context_size=%d to 2048 after fit FAILURE",
+                             user_context_size);
+            }
+            if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
+                ctx_params.n_ctx = 2048;
+            }
+            break;
+        case LLAMA_PARAMS_FIT_STATUS_ERROR:
+            RAC_LOG_ERROR("LLM.LlamaCpp", "llama_params_fit ERROR for model: %s. "
+                          "Falling back to conservative CPU-only defaults.", model_path.c_str());
+            model_params.n_gpu_layers = 0;
+            if (user_context_size > 2048) {
+                RAC_LOG_INFO("LLM.LlamaCpp", "Capping user-requested context_size=%d to 2048 after fit ERROR",
+                             user_context_size);
+            }
+            if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
+                ctx_params.n_ctx = 2048;
+            }
+            break;
+    }
+
+    // Apply user gpu_layers override after fit, respecting the CPU-only build constraint.
+    // llama_params_fit does not yet account for host memory in CPU-only builds
+    // (upstream PR: https://github.com/ggml-org/llama.cpp/pull/19711).
+#if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU)
+    if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
+        RAC_LOG_INFO("LLM.LlamaCpp", "CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. "
+                     "Applying conservative CPU defaults.");
+    }
+    if (user_gpu_layers > 0) {
+        RAC_LOG_INFO("LLM.LlamaCpp", "CPU-only build: ignoring user gpu_layers=%d (no GPU backend available)",
+                     user_gpu_layers);
+    }
+    model_params.n_gpu_layers = 0;
+    if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) {
+        ctx_params.n_ctx = 4096;
+        RAC_LOG_INFO("LLM.LlamaCpp", "CPU-only: capping context to %u", ctx_params.n_ctx);
+    }
+#else
+    if (user_gpu_layers >= 0) {
+        // llama_params_fit fell back to n_gpu_layers=0 for non-SUCCESS outcomes;
+        // honouring the user override here reinstates the OOM risk the fit call
+        // was supposed to prevent. Log a warning so it's visible in the event of
+        // a subsequent crash/OOM, but keep honouring the user's explicit request.
+        if (fit_status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
+            const char* fit_label =
+                fit_status == LLAMA_PARAMS_FIT_STATUS_FAILURE ? "FAILURE" : "ERROR";
+            RAC_LOG_WARNING("LLM.LlamaCpp",
+                            "Applying user gpu_layers=%d override despite llama_params_fit %s; risk of OOM",
+                            user_gpu_layers, fit_label);
+        }
+        model_params.n_gpu_layers = user_gpu_layers;
+        RAC_LOG_INFO("LLM.LlamaCpp", "Applying user GPU layers override: %d", user_gpu_layers);
     }
+#endif
 
-    model_params.n_gpu_layers = gpu_layers;
-    RAC_LOG_INFO("LLM.LlamaCpp", "Loading model with n_gpu_layers=%d", gpu_layers);
+    RAC_LOG_INFO("LLM.LlamaCpp", "Loading model with n_gpu_layers=%d", model_params.n_gpu_layers);
 
     model_ = llama_model_load_from_file(model_path.c_str(), model_params);
 
@@ -312,64 +373,30 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
     }
 
     int model_train_ctx = llama_model_n_ctx_train(model_);
-    RAC_LOG_INFO("LLM.LlamaCpp", "Model training context size: %d", model_train_ctx);
-
-    // Get model parameter count to determine appropriate context size
-    // Large models (7B+) need smaller context on mobile to fit in memory
     uint64_t n_params = llama_model_n_params(model_);
     double params_billions = static_cast<double>(n_params) / 1e9;
-    RAC_LOG_INFO("LLM.LlamaCpp", "Model parameters: %.2fB", params_billions);
-
-    // Post-load verification: warn if actual param count differs from filename heuristic
-    bool actual_is_large = (params_billions >= kLargeModelThresholdB);
-    if (actual_is_large && !is_large_model) {
-        RAC_LOG_INFO("LLM.LlamaCpp", "WARNING: Model has %.1fB params but filename didn't indicate large model. "
-                     "Consider using gpu_layers config for optimal performance.", params_billions);
-    } else if (!actual_is_large && is_large_model) {
-        RAC_LOG_INFO("LLM.LlamaCpp", "NOTE: Filename suggested large model but actual params are %.1fB. "
-                     "GPU layer limiting may be conservative.", params_billions);
-    }
-
-    // Adaptive context size based on model size for mobile devices
-    int adaptive_max_context;
-    if (params_billions >= kLargeModelThresholdB) {
-        adaptive_max_context = kLargeModelContextSize;
-        RAC_LOG_INFO("LLM.LlamaCpp", "Large model detected (%.1fB params), limiting context to %d for memory", params_billions, adaptive_max_context);
-    } else if (params_billions >= kMediumModelThresholdB) {
-        adaptive_max_context = kMediumModelContextSize;
-        RAC_LOG_INFO("LLM.LlamaCpp", "Medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context);
-    } else if (params_billions >= kSmallModelThresholdB) {
-        adaptive_max_context = kSmallModelContextSize;
-        RAC_LOG_INFO("LLM.LlamaCpp", "Small-medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context);
-    } else {
-        // Tiny models (<1B): can use larger context
-        adaptive_max_context = max_default_context_;
-    }
+    RAC_LOG_INFO("LLM.LlamaCpp", "Model loaded: %.2fB params, training context=%d", params_billions, model_train_ctx);
 
-    if (user_context_size > 0) {
-        context_size_ = std::min(user_context_size, model_train_ctx);
-        RAC_LOG_INFO("LLM.LlamaCpp", "Using user-provided context size: %d (requested: %d, model max: %d)", context_size_,
-                     user_context_size, model_train_ctx);
-    } else {
-        context_size_ = std::min({model_train_ctx, max_default_context_, adaptive_max_context});
-        RAC_LOG_INFO("LLM.LlamaCpp", "Auto-detected context size: %d (model: %d, cap: %d, adaptive: %d)", context_size_,
-                     model_train_ctx, max_default_context_, adaptive_max_context);
+    if (ctx_params.n_ctx == 0) {
+        ctx_params.n_ctx = std::min(model_train_ctx, max_default_context_);
     }
+    // ctx_params.n_ctx is uint32_t; clamp to INT_MAX before converting to int so
+    // a pathological fitted/user value above ~2.1B can't wrap to a negative
+    // number that std::min would then pick as the "smallest" context size.
+    const int fitted_ctx = static_cast<int>(
+        std::min(ctx_params.n_ctx, static_cast<uint32_t>(INT_MAX)));
+    context_size_ = std::min({fitted_ctx, model_train_ctx, max_default_context_});
+
+    RAC_LOG_INFO("LLM.LlamaCpp", "Final context size: %d (fitted=%u, train=%d, cap=%d)",
+                 context_size_, ctx_params.n_ctx, model_train_ctx, max_default_context_);
 
-    // Cap batch sizes to avoid exceeding Metal's 4 GB single-buffer limit on iOS.
-    // n_ctx controls conversation history length; n_batch/n_ubatch control how many
-    // tokens are processed in one GPU pass. llama.cpp splits larger prompts automatically.
     static constexpr int MAX_BATCH_SIZE = 2048;
     static constexpr int MAX_UBATCH_SIZE = 512;
     batch_size_ = std::min(context_size_, MAX_BATCH_SIZE);
 
-    llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = context_size_;
     ctx_params.n_batch = batch_size_;
     ctx_params.n_ubatch = std::min(batch_size_, MAX_UBATCH_SIZE);
-    ctx_params.n_threads = backend_->get_num_threads();
-    ctx_params.n_threads_batch = backend_->get_num_threads();
-    ctx_params.no_perf = true;
 
     RAC_LOG_INFO("LLM.LlamaCpp", "Context params: n_ctx=%d, n_batch=%d, n_ubatch=%d",
                  ctx_params.n_ctx, ctx_params.n_batch, ctx_params.n_ubatch);
@@ -1194,7 +1221,6 @@ bool LlamaCppTextGeneration::recreate_context() {
         context_ = nullptr;
     }
 
-    // Create new context (adapters are now visible to it)
     llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = context_size_;
     ctx_params.n_batch = batch_size_;