Commit ae5fea9

Merge PR #473: Adopt llama_params_fit for model loading (rebased #444)
Replaces the fragile filename-heuristic GPU-layer/context limiter in LlamaCppTextGeneration::load_model with llama.cpp's native llama_params_fit, which queries real device memory to pick n_gpu_layers and n_ctx, preventing OOM on large models. Also includes a CPU-only build fallback and fixes to the user-override path.

Original PR: #444 (rebased)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2 parents 59e8ff4 + ac2aef2 commit ae5fea9
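
For orientation, the sketch below condenses the new load path into one standalone function. It is not part of the commit; it simply mirrors the llama_params_fit call, status handling, and conservative fallbacks that appear in the diff further down. The fit function, its status enum, and llama_max_tensor_buft_overrides are taken from that diff rather than from an independently verified header, and the real load_model's config plumbing and logging are omitted.

    #include <cstddef>
    #include <string>
    #include <vector>
    #include "llama.h"

    // Condensed sketch of the fit-then-load flow adopted by this commit (see diff below).
    // Not a drop-in replacement for LlamaCppTextGeneration::load_model.
    static bool fit_and_load(const std::string& model_path) {
        llama_model_params   model_params = llama_model_default_params();
        llama_context_params ctx_params   = llama_context_default_params();
        ctx_params.n_ctx = 0;  // 0 = let the fit choose a context size

        const size_t n_devices = llama_max_devices();
        std::vector<float> tensor_split(n_devices, 0.0f);
        // Value-initialized overrides: the first element is already the null-pattern sentinel.
        std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());
        std::vector<size_t> margins(n_devices, 1024ull * 1024 * 1024);  // 1024 MiB headroom per device

        const llama_params_fit_status status = llama_params_fit(
            model_path.c_str(), &model_params, &ctx_params,
            tensor_split.data(), overrides.data(), margins.data(),
            /*n_ctx_min=*/2048, GGML_LOG_LEVEL_INFO);

        if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
            // Conservative CPU-only fallback, matching the diff below.
            model_params.n_gpu_layers = 0;
            if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
                ctx_params.n_ctx = 2048;
            }
        }

        llama_model* model = llama_model_load_from_file(model_path.c_str(), model_params);
        return model != nullptr;
    }

In the actual load_model below, the fitted values are adjusted further: an explicit user gpu_layers or context_size override is applied on top of the fit result, and CPU-only builds force n_gpu_layers to 0 regardless.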

1 file changed

Lines changed: 128 additions & 102 deletions


sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

@@ -4,6 +4,7 @@
 
 #include <algorithm>
 #include <chrono>
+#include <climits>
 #include <cstring>
 #include <string>
 #include <vector>
@@ -239,70 +240,130 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
     model_params.use_mmap = false;
 #endif
 
-    // Detect model size from filename to set appropriate GPU layers BEFORE loading
-    // This prevents OOM crashes on mobile devices with limited GPU memory
-    // Note: We use filename heuristics here because we can't know param count until after loading
-    std::string path_lower = model_path;
-    std::transform(path_lower.begin(), path_lower.end(), path_lower.begin(), ::tolower);
-
-    int gpu_layers = -1; // Default: all layers to GPU
-
-    // Check for large model indicators in filename using word boundary detection
-    // Patterns like "7b", "8b", "13b" should match at word boundaries to avoid
-    // false positives like "/backup7b/" or "/2017beta/"
-    auto is_model_size_marker = [&path_lower](const char* marker) {
-        size_t pos = path_lower.find(marker);
-        while (pos != std::string::npos) {
-            // Check for word boundary before (start of string, or non-alphanumeric)
-            bool valid_start = (pos == 0) || !std::isalnum(path_lower[pos - 1]);
-            // Check for word boundary after (end of string, or non-alphanumeric except digits for patterns like "7b-q4")
-            size_t end_pos = pos + strlen(marker);
-            bool valid_end = (end_pos >= path_lower.size()) ||
-                (!std::isalpha(path_lower[end_pos]) || path_lower[end_pos] == '-' || path_lower[end_pos] == '_');
-
-            if (valid_start && valid_end) {
-                return true;
-            }
-            pos = path_lower.find(marker, pos + 1);
-        }
-        return false;
-    };
+    // If llama_params_fit aborts, use the user-provided value
+    int user_gpu_layers = -1; // -1 = not set by user
+    if (config.contains("gpu_layers")) {
+        user_gpu_layers = config["gpu_layers"].get<int>();
+        RAC_LOG_INFO("LLM.LlamaCpp", "User-provided GPU layers: %d (will apply after fit)", user_gpu_layers);
+    }
 
-    // Detect large models (7B+) that may need GPU layer limiting on mobile
-    // First check for config-based override (for custom-named models)
-    bool is_large_model = false;
-    if (config.contains("expected_params_billions")) {
-        double expected_params = config["expected_params_billions"].get<double>();
-        is_large_model = (expected_params >= kLargeModelThresholdB);
-        if (is_large_model) {
-            RAC_LOG_INFO("LLM.LlamaCpp","Large model detected from config (%.1fB expected params)", expected_params);
-        }
+    // Set up context params early for llama_params_fit
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = 0;
+    ctx_params.n_threads = backend_->get_num_threads();
+    ctx_params.n_threads_batch = backend_->get_num_threads();
+    ctx_params.no_perf = true;
+
+    if (user_context_size > 0) {
+        ctx_params.n_ctx = user_context_size;
+        RAC_LOG_INFO("LLM.LlamaCpp", "User-provided context size: %d", user_context_size);
     }
 
-    // Fall back to filename heuristics if no config provided
-    if (!is_large_model) {
-        is_large_model = is_model_size_marker("7b") ||
-                         is_model_size_marker("8b") ||
-                         is_model_size_marker("9b") ||
-                         is_model_size_marker("13b") ||
-                         is_model_size_marker("70b");
+    size_t n_devices = llama_max_devices();
+    size_t n_overrides = llama_max_tensor_buft_overrides();
+
+    std::vector<float> tensor_split(n_devices, 0.0f);
+    // llama.cpp iterates tensor_buft_overrides until it hits a zero-valued
+    // sentinel entry (pattern == nullptr). Value-initializing the vector to
+    // all zeros means the first element is already that sentinel, so an
+    // empty vector is interpreted as "no tensor buft overrides."
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides(n_overrides);
+    std::vector<size_t> margins(n_devices, 0);
+
+    size_t margin_mib = 1024; // Configurable parameter
+    if (config.contains("fit_margin_mib")) {
+        margin_mib = config["fit_margin_mib"].get<size_t>();
+    }
+    for (size_t i = 0; i < n_devices; ++i) {
+        margins[i] = margin_mib * 1024 * 1024;
     }
 
-    if (is_large_model) {
-        // For 7B+ models on mobile: limit GPU layers to prevent OOM
-        // Most 7B models have 32 layers, offload ~24 to GPU, rest to CPU
-        gpu_layers = kLargeModelGpuLayers;
-        RAC_LOG_INFO("LLM.LlamaCpp","Large model detected, limiting GPU layers to %d to prevent OOM", gpu_layers);
+    uint32_t n_ctx_min = 2048; // Configurable parameter
+    if (config.contains("n_ctx_min")) {
+        n_ctx_min = config["n_ctx_min"].get<uint32_t>();
     }
 
-    // Allow user override via config
-    if (config.contains("gpu_layers")) {
-        gpu_layers = config["gpu_layers"].get<int>();
-        RAC_LOG_INFO("LLM.LlamaCpp","Using user-provided GPU layers: %d", gpu_layers);
+    RAC_LOG_INFO("LLM.LlamaCpp", "Calling llama_params_fit (margin=%zuMiB, n_ctx_min=%u, n_devices=%zu)",
+                 margin_mib, n_ctx_min, n_devices);
+
+    llama_params_fit_status fit_status = llama_params_fit(
+        model_path.c_str(),
+        &model_params,
+        &ctx_params,
+        tensor_split.data(),
+        tensor_buft_overrides.data(),
+        margins.data(),
+        n_ctx_min,
+        GGML_LOG_LEVEL_INFO
+    );
+
+    switch (fit_status) {
+        case LLAMA_PARAMS_FIT_STATUS_SUCCESS:
+            RAC_LOG_INFO("LLM.LlamaCpp", "llama_params_fit SUCCESS: n_gpu_layers=%d, n_ctx=%u",
+                         model_params.n_gpu_layers, ctx_params.n_ctx);
+            break;
+        case LLAMA_PARAMS_FIT_STATUS_FAILURE:
+            RAC_LOG_INFO("LLM.LlamaCpp", "llama_params_fit FAILURE: could not fit model to device memory. "
+                         "Proceeding with conservative CPU-only defaults.");
+            model_params.n_gpu_layers = 0;
+            if (user_context_size > 0 && user_context_size > 2048) {
+                RAC_LOG_INFO("LLM.LlamaCpp", "Capping user-requested context_size=%d to 2048 after fit FAILURE",
+                             user_context_size);
+            }
+            if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
+                ctx_params.n_ctx = 2048;
+            }
+            break;
+        case LLAMA_PARAMS_FIT_STATUS_ERROR:
+            RAC_LOG_ERROR("LLM.LlamaCpp", "llama_params_fit ERROR for model: %s. "
+                          "Falling back to conservative CPU-only defaults.", model_path.c_str());
+            model_params.n_gpu_layers = 0;
+            if (user_context_size > 0 && user_context_size > 2048) {
+                RAC_LOG_INFO("LLM.LlamaCpp", "Capping user-requested context_size=%d to 2048 after fit ERROR",
+                             user_context_size);
+            }
+            if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
+                ctx_params.n_ctx = 2048;
+            }
+            break;
+    }
+
+    // Apply user gpu_layers override after fit, respecting the CPU-only build constraint.
+    // llama_params_fit does not yet account for host memory in CPU-only builds
+    // (upstream PR: https://github.com/ggml-org/llama.cpp/pull/19711).
+#if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU)
+    if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
+        RAC_LOG_INFO("LLM.LlamaCpp", "CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. "
+                     "Applying conservative CPU defaults.");
+    }
+    if (user_gpu_layers > 0) {
+        RAC_LOG_INFO("LLM.LlamaCpp", "CPU-only build: ignoring user gpu_layers=%d (no GPU backend available)",
+                     user_gpu_layers);
+    }
+    model_params.n_gpu_layers = 0;
+    if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) {
+        ctx_params.n_ctx = 4096;
+        RAC_LOG_INFO("LLM.LlamaCpp", "CPU-only: capping context to %u", ctx_params.n_ctx);
+    }
+#else
+    if (user_gpu_layers >= 0) {
+        // llama_params_fit fell back to n_gpu_layers=0 for non-SUCCESS outcomes;
+        // honouring the user override here reinstates the OOM risk the fit call
+        // was supposed to prevent. Log a warning so it's visible in the event of
+        // a subsequent crash/OOM, but keep honouring the user's explicit request.
+        if (fit_status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
+            const char* fit_label =
+                fit_status == LLAMA_PARAMS_FIT_STATUS_FAILURE ? "FAILURE" : "ERROR";
+            RAC_LOG_WARNING("LLM.LlamaCpp",
+                            "Applying user gpu_layers=%d override despite llama_params_fit %s — risk of OOM",
+                            user_gpu_layers, fit_label);
+        }
+        model_params.n_gpu_layers = user_gpu_layers;
+        RAC_LOG_INFO("LLM.LlamaCpp", "Applying user GPU layers override: %d", user_gpu_layers);
     }
+#endif
 
-    model_params.n_gpu_layers = gpu_layers;
-    RAC_LOG_INFO("LLM.LlamaCpp","Loading model with n_gpu_layers=%d", gpu_layers);
+    RAC_LOG_INFO("LLM.LlamaCpp", "Loading model with n_gpu_layers=%d", model_params.n_gpu_layers);
 
     model_ = llama_model_load_from_file(model_path.c_str(), model_params);
 
@@ -312,64 +373,30 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
     }
 
     int model_train_ctx = llama_model_n_ctx_train(model_);
-    RAC_LOG_INFO("LLM.LlamaCpp","Model training context size: %d", model_train_ctx);
-
-    // Get model parameter count to determine appropriate context size
-    // Large models (7B+) need smaller context on mobile to fit in memory
     uint64_t n_params = llama_model_n_params(model_);
     double params_billions = static_cast<double>(n_params) / 1e9;
-    RAC_LOG_INFO("LLM.LlamaCpp","Model parameters: %.2fB", params_billions);
-
-    // Post-load verification: warn if actual param count differs from filename heuristic
-    bool actual_is_large = (params_billions >= kLargeModelThresholdB);
-    if (actual_is_large && !is_large_model) {
-        RAC_LOG_INFO("LLM.LlamaCpp","WARNING: Model has %.1fB params but filename didn't indicate large model. "
-                     "Consider using gpu_layers config for optimal performance.", params_billions);
-    } else if (!actual_is_large && is_large_model) {
-        RAC_LOG_INFO("LLM.LlamaCpp","NOTE: Filename suggested large model but actual params are %.1fB. "
-                     "GPU layer limiting may be conservative.", params_billions);
-    }
-
-    // Adaptive context size based on model size for mobile devices
-    int adaptive_max_context;
-    if (params_billions >= kLargeModelThresholdB) {
-        adaptive_max_context = kLargeModelContextSize;
-        RAC_LOG_INFO("LLM.LlamaCpp","Large model detected (%.1fB params), limiting context to %d for memory", params_billions, adaptive_max_context);
-    } else if (params_billions >= kMediumModelThresholdB) {
-        adaptive_max_context = kMediumModelContextSize;
-        RAC_LOG_INFO("LLM.LlamaCpp","Medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context);
-    } else if (params_billions >= kSmallModelThresholdB) {
-        adaptive_max_context = kSmallModelContextSize;
-        RAC_LOG_INFO("LLM.LlamaCpp","Small-medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context);
-    } else {
-        // Tiny models (<1B): can use larger context
-        adaptive_max_context = max_default_context_;
-    }
+    RAC_LOG_INFO("LLM.LlamaCpp", "Model loaded: %.2fB params, training context=%d", params_billions, model_train_ctx);
 
-    if (user_context_size > 0) {
-        context_size_ = std::min(user_context_size, model_train_ctx);
-        RAC_LOG_INFO("LLM.LlamaCpp","Using user-provided context size: %d (requested: %d, model max: %d)", context_size_,
-                     user_context_size, model_train_ctx);
-    } else {
-        context_size_ = std::min({model_train_ctx, max_default_context_, adaptive_max_context});
-        RAC_LOG_INFO("LLM.LlamaCpp","Auto-detected context size: %d (model: %d, cap: %d, adaptive: %d)", context_size_,
-                     model_train_ctx, max_default_context_, adaptive_max_context);
+    if (ctx_params.n_ctx == 0) {
+        ctx_params.n_ctx = std::min(model_train_ctx, max_default_context_);
     }
+    // ctx_params.n_ctx is uint32_t; clamp to INT_MAX before converting to int so
+    // a pathological fitted/user value above ~2.1B can't wrap to a negative
+    // number that `std::min` would then pick as the "smallest" context size.
+    const int fitted_ctx = static_cast<int>(
+        std::min(ctx_params.n_ctx, static_cast<uint32_t>(INT_MAX)));
+    context_size_ = std::min({fitted_ctx, model_train_ctx, max_default_context_});
+
+    RAC_LOG_INFO("LLM.LlamaCpp", "Final context size: %d (fitted=%u, train=%d, cap=%d)",
+                 context_size_, ctx_params.n_ctx, model_train_ctx, max_default_context_);
 
-    // Cap batch sizes to avoid exceeding Metal's 4 GB single-buffer limit on iOS.
-    // n_ctx controls conversation history length; n_batch/n_ubatch control how many
-    // tokens are processed in one GPU pass. llama.cpp splits larger prompts automatically.
     static constexpr int MAX_BATCH_SIZE = 2048;
     static constexpr int MAX_UBATCH_SIZE = 512;
     batch_size_ = std::min(context_size_, MAX_BATCH_SIZE);
 
-    llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = context_size_;
     ctx_params.n_batch = batch_size_;
     ctx_params.n_ubatch = std::min(batch_size_, MAX_UBATCH_SIZE);
-    ctx_params.n_threads = backend_->get_num_threads();
-    ctx_params.n_threads_batch = backend_->get_num_threads();
-    ctx_params.no_perf = true;
 
     RAC_LOG_INFO("LLM.LlamaCpp", "Context params: n_ctx=%d, n_batch=%d, n_ubatch=%d",
                  ctx_params.n_ctx, ctx_params.n_batch, ctx_params.n_ubatch);
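
One comment added in the hunk above (just before fitted_ctx is computed) explains why the fitted uint32_t context is clamped to INT_MAX before it enters the int-based std::min. Below is a small standalone illustration of the wrap-around being guarded against; the oversized value is invented purely for demonstration:

    #include <algorithm>
    #include <climits>
    #include <cstdint>
    #include <cstdio>

    int main() {
        uint32_t n_ctx = 3000000000u;  // pathological fitted/user value above INT_MAX
        // A direct conversion typically wraps to a large negative int, which
        // std::min would then pick as the "smallest" context size.
        int naive   = static_cast<int>(n_ctx);
        int clamped = static_cast<int>(std::min(n_ctx, static_cast<uint32_t>(INT_MAX)));
        std::printf("min with naive:   %d\n", std::min({naive, 8192, 4096}));    // negative value wins
        std::printf("min with clamped: %d\n", std::min({clamped, 8192, 4096}));  // 4096 wins
        return 0;
    }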
@@ -1194,7 +1221,6 @@ bool LlamaCppTextGeneration::recreate_context() {
         context_ = nullptr;
     }
 
-    // Create new context (adapters are now visible to it)
     llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = context_size_;
     ctx_params.n_batch = batch_size_;
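
As a usage note, the rewritten load path keeps reading its optional overrides from the same config object as before. The sketch below lists the keys it now consults, assuming the nlohmann::json-style access pattern visible in the diff (config.contains / config[...].get<T>()); the call at the end is only indicative, since load_model's full signature is not shown in this commit.

    #include <nlohmann/json.hpp>

    // Optional overrides consulted by the fit-based load path (see diff above).
    nlohmann::json config = {
        {"gpu_layers", 24},        // explicit n_gpu_layers, applied after llama_params_fit
        {"fit_margin_mib", 1024},  // per-device memory headroom handed to the fit
        {"n_ctx_min", 2048}        // smallest context size the fit may settle on
    };
    // "expected_params_billions" is no longer read; the filename heuristic it fed was removed.
    // backend.load_model(model_path, /* ..., */ config);  // indicative only; real signature not shown here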
