@@ -207,70 +207,101 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
     model_params.use_mmap = false;
 #endif
 
-    // Detect model size from filename to set appropriate GPU layers BEFORE loading
-    // This prevents OOM crashes on mobile devices with limited GPU memory
-    // Note: We use filename heuristics here because we can't know param count until after loading
-    std::string path_lower = model_path;
-    std::transform(path_lower.begin(), path_lower.end(), path_lower.begin(), ::tolower);
-
-    int gpu_layers = -1;  // Default: all layers to GPU
-
-    // Check for large model indicators in filename using word boundary detection
-    // Patterns like "7b", "8b", "13b" should match at word boundaries to avoid
-    // false positives like "/backup7b/" or "/2017beta/"
-    auto is_model_size_marker = [&path_lower](const char* marker) {
-        size_t pos = path_lower.find(marker);
-        while (pos != std::string::npos) {
-            // Check for word boundary before (start of string, or non-alphanumeric)
-            bool valid_start = (pos == 0) || !std::isalnum(path_lower[pos - 1]);
-            // Check for word boundary after (end of string, or non-alphanumeric except digits for patterns like "7b-q4")
-            size_t end_pos = pos + strlen(marker);
-            bool valid_end = (end_pos >= path_lower.size()) ||
-                             (!std::isalpha(path_lower[end_pos]) || path_lower[end_pos] == '-' || path_lower[end_pos] == '_');
-
-            if (valid_start && valid_end) {
-                return true;
-            }
-            pos = path_lower.find(marker, pos + 1);
-        }
-        return false;
-    };
+    // Capture a user-provided gpu_layers value now; it is applied as an override after llama_params_fit runs
+    int user_gpu_layers = -1;  // -1 = not set by user
+    if (config.contains("gpu_layers")) {
+        user_gpu_layers = config["gpu_layers"].get<int>();
+        LOGI("User-provided GPU layers: %d (will apply after fit)", user_gpu_layers);
+    }
 
-    // Detect large models (7B+) that may need GPU layer limiting on mobile
-    // First check for config-based override (for custom-named models)
-    bool is_large_model = false;
-    if (config.contains("expected_params_billions")) {
-        double expected_params = config["expected_params_billions"].get<double>();
-        is_large_model = (expected_params >= 7.0);
-        if (is_large_model) {
-            LOGI("Large model detected from config (%.1fB expected params)", expected_params);
-        }
+    // Set up context params early for llama_params_fit
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = 0;
+    ctx_params.n_threads = backend_->get_num_threads();
+    ctx_params.n_threads_batch = backend_->get_num_threads();
+    ctx_params.no_perf = true;
+
+    if (user_context_size > 0) {
+        ctx_params.n_ctx = user_context_size;
+        LOGI("User-provided context size: %d", user_context_size);
     }
 
-    // Fall back to filename heuristics if no config provided
-    if (!is_large_model) {
-        is_large_model = is_model_size_marker("7b") ||
-                         is_model_size_marker("8b") ||
-                         is_model_size_marker("9b") ||
-                         is_model_size_marker("13b") ||
-                         is_model_size_marker("70b");
+    size_t n_devices = llama_max_devices();
+    size_t n_overrides = llama_max_tensor_buft_overrides();
+
+    std::vector<float> tensor_split(n_devices, 0.0f);
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides(n_overrides);
+    std::vector<size_t> margins(n_devices, 0);
+
+    size_t margin_mib = 1024;  // Configurable parameter
+    if (config.contains("fit_margin_mib")) {
+        margin_mib = config["fit_margin_mib"].get<size_t>();
+    }
+    for (size_t i = 0; i < n_devices; ++i) {
+        margins[i] = margin_mib * 1024 * 1024;
     }
 
-    if (is_large_model) {
-        // For 7B+ models on mobile: limit GPU layers to prevent OOM
-        // Most 7B models have 32 layers, offload ~24 to GPU, rest to CPU
-        gpu_layers = 24;
-        LOGI("Large model detected, limiting GPU layers to %d to prevent OOM", gpu_layers);
+    uint32_t n_ctx_min = 2048;  // Configurable parameter
+
+    LOGI("Calling llama_params_fit (margin=%zuMiB, n_ctx_min=%u, n_devices=%zu)",
+         margin_mib, n_ctx_min, n_devices);
+
+    llama_params_fit_status fit_status = llama_params_fit(
+        model_path.c_str(),
+        &model_params,
+        &ctx_params,
+        tensor_split.data(),
+        tensor_buft_overrides.data(),
+        margins.data(),
+        n_ctx_min,
+        GGML_LOG_LEVEL_INFO
+    );
+
+    switch (fit_status) {
+        case LLAMA_PARAMS_FIT_STATUS_SUCCESS:
+            LOGI("llama_params_fit SUCCESS: n_gpu_layers=%d, n_ctx=%u",
+                 model_params.n_gpu_layers, ctx_params.n_ctx);
+            break;
+        case LLAMA_PARAMS_FIT_STATUS_FAILURE:
+            LOGI("llama_params_fit FAILURE: could not fit model to device memory. "
+                 "Proceeding with conservative CPU-only defaults.");
+            model_params.n_gpu_layers = 0;
+            if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
+                ctx_params.n_ctx = 2048;
+            }
+            break;
+        case LLAMA_PARAMS_FIT_STATUS_ERROR:
+            LOGE("llama_params_fit ERROR for model: %s. "
+                 "Falling back to conservative CPU-only defaults.", model_path.c_str());
+            model_params.n_gpu_layers = 0;
+            if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
+                ctx_params.n_ctx = 2048;
+            }
+            break;
     }
 
-    // Allow user override via config
-    if (config.contains("gpu_layers")) {
-        gpu_layers = config["gpu_layers"].get<int>();
-        LOGI("Using user-provided GPU layers: %d", gpu_layers);
+    // Apply user gpu_layers override after fit
+    if (user_gpu_layers >= 0) {
+        model_params.n_gpu_layers = user_gpu_layers;
+        LOGI("Applying user GPU layers override: %d", user_gpu_layers);
+    }
+
+    // llama_params_fit currently accounts only for GPU memory and does not detect CPU-only memory.
+    // There is an ongoing upstream PR that, once merged, should address this;
+    // the block below is a placeholder until then.
+#if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU)
+    if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
+        LOGI("CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. "
+             "Applying conservative CPU defaults.");
+    }
+    model_params.n_gpu_layers = 0;
+    if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) {
+        ctx_params.n_ctx = 4096;
+        LOGI("CPU-only: capping context to %u", ctx_params.n_ctx);
     }
+#endif
 
-    model_params.n_gpu_layers = gpu_layers;
-    LOGI("Loading model with n_gpu_layers=%d", gpu_layers);
+    LOGI("Loading model with n_gpu_layers=%d", model_params.n_gpu_layers);
 
     model_ = llama_model_load_from_file(model_path.c_str(), model_params);
 
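The hunk above reads two optional config keys: "fit_margin_mib" sets the per-device memory margin handed to llama_params_fit, and "gpu_layers" is applied to model_params.n_gpu_layers after the fit as an explicit override. A minimal caller-side sketch, assuming the config object is an nlohmann::json (suggested by the .contains()/.get<T>() calls) and with the load_model call left hypothetical since its full signature is not shown here:

#include <nlohmann/json.hpp>

// Sketch only: the key names match the hunk above; everything else is an assumption.
nlohmann::json make_generation_config() {
    nlohmann::json config;
    config["gpu_layers"]     = 16;   // forces model_params.n_gpu_layers after llama_params_fit
    config["fit_margin_mib"] = 512;  // per-device free-memory margin for the fit, in MiB
    return config;
}
// generator.load_model("/path/to/model.gguf", make_generation_config() /*, ... */);  // hypothetical call
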
@@ -280,60 +311,24 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
     }
 
     int model_train_ctx = llama_model_n_ctx_train(model_);
-    LOGI("Model training context size: %d", model_train_ctx);
-
-    // Get model parameter count to determine appropriate context size
-    // Large models (7B+) need smaller context on mobile to fit in memory
     uint64_t n_params = llama_model_n_params(model_);
     double params_billions = static_cast<double>(n_params) / 1e9;
-    LOGI("Model parameters: %.2fB", params_billions);
-
-    // Post-load verification: warn if actual param count differs from filename heuristic
-    bool actual_is_large = (params_billions >= 7.0);
-    if (actual_is_large && !is_large_model) {
-        LOGI("WARNING: Model has %.1fB params but filename didn't indicate large model. "
-             "Consider using gpu_layers config for optimal performance.", params_billions);
-    } else if (!actual_is_large && is_large_model) {
-        LOGI("NOTE: Filename suggested large model but actual params are %.1fB. "
-             "GPU layer limiting may be conservative.", params_billions);
-    }
-
-    // Adaptive context size based on model size for mobile devices
-    int adaptive_max_context;
-    if (params_billions >= 7.0) {
-        // 7B+ models: use 2048 context to fit in ~6GB GPU memory
-        adaptive_max_context = 2048;
-        LOGI("Large model detected (%.1fB params), limiting context to %d for memory", params_billions, adaptive_max_context);
-    } else if (params_billions >= 3.0) {
-        // 3-7B models: use 4096 context
-        adaptive_max_context = 4096;
-        LOGI("Medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context);
-    } else if (params_billions >= 1.0) {
-        // 1-3B models: use 2048 context (higher values OOM on mobile, especially with LoRA)
-        adaptive_max_context = 2048;
-        LOGI("Small-medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context);
-    } else {
-        // Tiny models (<1B): can use larger context
-        adaptive_max_context = max_default_context_;
-    }
+    LOGI("Model loaded: %.2fB params, training context=%d", params_billions, model_train_ctx);
 
-    if (user_context_size > 0) {
-        context_size_ = std::min(user_context_size, model_train_ctx);
-        LOGI("Using user-provided context size: %d (requested: %d, model max: %d)", context_size_,
-             user_context_size, model_train_ctx);
-    } else {
-        context_size_ = std::min({model_train_ctx, max_default_context_, adaptive_max_context});
-        LOGI("Auto-detected context size: %d (model: %d, cap: %d, adaptive: %d)", context_size_,
-             model_train_ctx, max_default_context_, adaptive_max_context);
+    if (ctx_params.n_ctx == 0) {
+        ctx_params.n_ctx = std::min(model_train_ctx, max_default_context_);
     }
+    context_size_ = std::min({(int)ctx_params.n_ctx, model_train_ctx, max_default_context_});
+
+    LOGI("Final context size: %d (fitted=%u, train=%d, cap=%d)",
+         context_size_, ctx_params.n_ctx, model_train_ctx, max_default_context_);
+
+    int max_safe_batch = 2048;  // Configurable parameter
+    int safe_batch_size = std::min(context_size_, max_safe_batch);
 
-    llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = context_size_;
-    ctx_params.n_batch = context_size_;   // Allow processing full prompt at once
-    ctx_params.n_ubatch = context_size_;  // Physical batch size must also match
-    ctx_params.n_threads = backend_->get_num_threads();
-    ctx_params.n_threads_batch = backend_->get_num_threads();
-    ctx_params.no_perf = true;
+    ctx_params.n_batch = safe_batch_size;
+    ctx_params.n_ubatch = safe_batch_size;
 
     context_ = llama_init_from_model(model_, ctx_params);
 
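After the model loads, the final context becomes the smallest of the fitted ctx_params.n_ctx, the model's training context, and max_default_context_, and n_batch/n_ubatch are further capped at 2048 instead of tracking the full context. A standalone sketch of just that arithmetic, with made-up values for illustration:

#include <algorithm>
#include <cstdio>

int main() {
    // Illustrative inputs, not values from the diff
    unsigned fitted_n_ctx   = 8192;  // what llama_params_fit left in ctx_params.n_ctx
    int model_train_ctx     = 4096;  // llama_model_n_ctx_train()
    int max_default_context = 4096;  // the max_default_context_ member

    int context_size = std::min({(int)fitted_n_ctx, model_train_ctx, max_default_context});
    int batch_size   = std::min(context_size, 2048);  // max_safe_batch cap

    std::printf("n_ctx=%d n_batch=%d n_ubatch=%d\n", context_size, batch_size, batch_size);
    return 0;
}
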
@@ -802,11 +797,13 @@ bool LlamaCppTextGeneration::recreate_context() {
         context_ = nullptr;
     }
 
-    // Create new context (adapters are now visible to it)
+    int max_safe_batch = 2048;  // Configurable parameter
+    int safe_batch_size = std::min(context_size_, max_safe_batch);
+
     llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = context_size_;
-    ctx_params.n_batch = context_size_;
-    ctx_params.n_ubatch = context_size_;
+    ctx_params.n_batch = safe_batch_size;
+    ctx_params.n_ubatch = safe_batch_size;
     ctx_params.n_threads = backend_->get_num_threads();
     ctx_params.n_threads_batch = backend_->get_num_threads();
     ctx_params.no_perf = true;
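recreate_context() now computes the same 2048-capped batch size as load_model(). As a design note, the shared rule is small enough to live in one helper; a sketch of that idea (not code from this diff, names are illustrative):

#include <algorithm>

// Hypothetical helper so both call sites stay in sync.
static int capped_batch_size(int context_size, int max_safe_batch = 2048) {
    return std::min(context_size, max_safe_batch);
}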