Commit 8508673

Quick Model Loading fix
Fixes a model loading issue caused by huge buffer allocations.
1 parent 7ed42a7 commit 8508673
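
For readers skimming the diff below: the core of the fix is that n_batch/n_ubatch are no longer tied to the full context size (which previously forced huge compute-buffer allocations), and llama_params_fit is used to pick n_gpu_layers/n_ctx that fit device memory. The snippet that follows is only an illustrative sketch of the batch-clamping idea, not code from this commit; the clamp_batch helper and the sample values are made up for illustration.

// Illustrative sketch only -- not part of this commit. It mirrors the
// max_safe_batch idea from the diff below: clamp n_batch/n_ubatch to a
// safe maximum instead of sizing them to the full context.
#include <algorithm>
#include <cstdio>

// Hypothetical helper; the actual change inlines this logic in
// load_model() and recreate_context().
static int clamp_batch(int context_size, int max_safe_batch = 2048) {
    return std::min(context_size, max_safe_batch);
}

int main() {
    int context_size = 8192;                       // example fitted context
    int safe_batch   = clamp_batch(context_size);  // -> 2048, not 8192
    std::printf("n_ctx=%d, n_batch=n_ubatch=%d\n", context_size, safe_batch);
    return 0;
}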

1 file changed

File tree

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Lines changed: 103 additions & 106 deletions
@@ -207,70 +207,101 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
     model_params.use_mmap = false;
 #endif
 
-    // Detect model size from filename to set appropriate GPU layers BEFORE loading
-    // This prevents OOM crashes on mobile devices with limited GPU memory
-    // Note: We use filename heuristics here because we can't know param count until after loading
-    std::string path_lower = model_path;
-    std::transform(path_lower.begin(), path_lower.end(), path_lower.begin(), ::tolower);
-
-    int gpu_layers = -1; // Default: all layers to GPU
-
-    // Check for large model indicators in filename using word boundary detection
-    // Patterns like "7b", "8b", "13b" should match at word boundaries to avoid
-    // false positives like "/backup7b/" or "/2017beta/"
-    auto is_model_size_marker = [&path_lower](const char* marker) {
-        size_t pos = path_lower.find(marker);
-        while (pos != std::string::npos) {
-            // Check for word boundary before (start of string, or non-alphanumeric)
-            bool valid_start = (pos == 0) || !std::isalnum(path_lower[pos - 1]);
-            // Check for word boundary after (end of string, or non-alphanumeric except digits for patterns like "7b-q4")
-            size_t end_pos = pos + strlen(marker);
-            bool valid_end = (end_pos >= path_lower.size()) ||
-                             (!std::isalpha(path_lower[end_pos]) || path_lower[end_pos] == '-' || path_lower[end_pos] == '_');
-
-            if (valid_start && valid_end) {
-                return true;
-            }
-            pos = path_lower.find(marker, pos + 1);
-        }
-        return false;
-    };
+    // If llama_params_fit aborts, use the user-provided value
+    int user_gpu_layers = -1; // -1 = not set by user
+    if (config.contains("gpu_layers")) {
+        user_gpu_layers = config["gpu_layers"].get<int>();
+        LOGI("User-provided GPU layers: %d (will apply after fit)", user_gpu_layers);
+    }
 
-    // Detect large models (7B+) that may need GPU layer limiting on mobile
-    // First check for config-based override (for custom-named models)
-    bool is_large_model = false;
-    if (config.contains("expected_params_billions")) {
-        double expected_params = config["expected_params_billions"].get<double>();
-        is_large_model = (expected_params >= 7.0);
-        if (is_large_model) {
-            LOGI("Large model detected from config (%.1fB expected params)", expected_params);
-        }
+    // Set up context params early for llama_params_fit
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = 0;
+    ctx_params.n_threads = backend_->get_num_threads();
+    ctx_params.n_threads_batch = backend_->get_num_threads();
+    ctx_params.no_perf = true;
+
+    if (user_context_size > 0) {
+        ctx_params.n_ctx = user_context_size;
+        LOGI("User-provided context size: %d", user_context_size);
     }
 
-    // Fall back to filename heuristics if no config provided
-    if (!is_large_model) {
-        is_large_model = is_model_size_marker("7b") ||
-                         is_model_size_marker("8b") ||
-                         is_model_size_marker("9b") ||
-                         is_model_size_marker("13b") ||
-                         is_model_size_marker("70b");
+    size_t n_devices = llama_max_devices();
+    size_t n_overrides = llama_max_tensor_buft_overrides();
+
+    std::vector<float> tensor_split(n_devices, 0.0f);
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides(n_overrides);
+    std::vector<size_t> margins(n_devices, 0);
+
+    size_t margin_mib = 1024; // Configurable parameter
+    if (config.contains("fit_margin_mib")) {
+        margin_mib = config["fit_margin_mib"].get<size_t>();
+    }
+    for (size_t i = 0; i < n_devices; ++i) {
+        margins[i] = margin_mib * 1024 * 1024;
     }
 
-    if (is_large_model) {
-        // For 7B+ models on mobile: limit GPU layers to prevent OOM
-        // Most 7B models have 32 layers, offload ~24 to GPU, rest to CPU
-        gpu_layers = 24;
-        LOGI("Large model detected, limiting GPU layers to %d to prevent OOM", gpu_layers);
+    uint32_t n_ctx_min = 2048; // Configurable parameter
+
+    LOGI("Calling llama_params_fit (margin=%zuMiB, n_ctx_min=%u, n_devices=%zu)",
+         margin_mib, n_ctx_min, n_devices);
+
+    llama_params_fit_status fit_status = llama_params_fit(
+        model_path.c_str(),
+        &model_params,
+        &ctx_params,
+        tensor_split.data(),
+        tensor_buft_overrides.data(),
+        margins.data(),
+        n_ctx_min,
+        GGML_LOG_LEVEL_INFO
+    );
+
+    switch (fit_status) {
+        case LLAMA_PARAMS_FIT_STATUS_SUCCESS:
+            LOGI("llama_params_fit SUCCESS: n_gpu_layers=%d, n_ctx=%u",
+                 model_params.n_gpu_layers, ctx_params.n_ctx);
+            break;
+        case LLAMA_PARAMS_FIT_STATUS_FAILURE:
+            LOGI("llama_params_fit FAILURE: could not fit model to device memory. "
+                 "Proceeding with conservative CPU-only defaults.");
+            model_params.n_gpu_layers = 0;
+            if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
+                ctx_params.n_ctx = 2048;
+            }
+            break;
+        case LLAMA_PARAMS_FIT_STATUS_ERROR:
+            LOGE("llama_params_fit ERROR for model: %s. "
+                 "Falling back to conservative CPU-only defaults.", model_path.c_str());
+            model_params.n_gpu_layers = 0;
+            if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
+                ctx_params.n_ctx = 2048;
+            }
+            break;
     }
 
-    // Allow user override via config
-    if (config.contains("gpu_layers")) {
-        gpu_layers = config["gpu_layers"].get<int>();
-        LOGI("Using user-provided GPU layers: %d", gpu_layers);
+    // Apply user gpu_layers override after fit
+    if (user_gpu_layers >= 0) {
+        model_params.n_gpu_layers = user_gpu_layers;
+        LOGI("Applying user GPU layers override: %d", user_gpu_layers);
+    }
+
+    // Currently llama_params_fit does not detect CPU-only memory.
+    // Upstream has an ongoing PR which, once merged, should solve this;
+    // this should work as a placeholder until then.
+#if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU)
+    if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
+        LOGI("CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. "
+             "Applying conservative CPU defaults.");
+    }
+    model_params.n_gpu_layers = 0;
+    if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) {
+        ctx_params.n_ctx = 4096;
+        LOGI("CPU-only: capping context to %u", ctx_params.n_ctx);
     }
+#endif
 
-    model_params.n_gpu_layers = gpu_layers;
-    LOGI("Loading model with n_gpu_layers=%d", gpu_layers);
+    LOGI("Loading model with n_gpu_layers=%d", model_params.n_gpu_layers);
 
     model_ = llama_model_load_from_file(model_path.c_str(), model_params);
 
@@ -280,60 +311,24 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
     }
 
     int model_train_ctx = llama_model_n_ctx_train(model_);
-    LOGI("Model training context size: %d", model_train_ctx);
-
-    // Get model parameter count to determine appropriate context size
-    // Large models (7B+) need smaller context on mobile to fit in memory
     uint64_t n_params = llama_model_n_params(model_);
     double params_billions = static_cast<double>(n_params) / 1e9;
-    LOGI("Model parameters: %.2fB", params_billions);
-
-    // Post-load verification: warn if actual param count differs from filename heuristic
-    bool actual_is_large = (params_billions >= 7.0);
-    if (actual_is_large && !is_large_model) {
-        LOGI("WARNING: Model has %.1fB params but filename didn't indicate large model. "
-             "Consider using gpu_layers config for optimal performance.", params_billions);
-    } else if (!actual_is_large && is_large_model) {
-        LOGI("NOTE: Filename suggested large model but actual params are %.1fB. "
-             "GPU layer limiting may be conservative.", params_billions);
-    }
-
-    // Adaptive context size based on model size for mobile devices
-    int adaptive_max_context;
-    if (params_billions >= 7.0) {
-        // 7B+ models: use 2048 context to fit in ~6GB GPU memory
-        adaptive_max_context = 2048;
-        LOGI("Large model detected (%.1fB params), limiting context to %d for memory", params_billions, adaptive_max_context);
-    } else if (params_billions >= 3.0) {
-        // 3-7B models: use 4096 context
-        adaptive_max_context = 4096;
-        LOGI("Medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context);
-    } else if (params_billions >= 1.0) {
-        // 1-3B models: use 2048 context (higher values OOM on mobile, especially with LoRA)
-        adaptive_max_context = 2048;
-        LOGI("Small-medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context);
-    } else {
-        // Tiny models (<1B): can use larger context
-        adaptive_max_context = max_default_context_;
-    }
+    LOGI("Model loaded: %.2fB params, training context=%d", params_billions, model_train_ctx);
 
-    if (user_context_size > 0) {
-        context_size_ = std::min(user_context_size, model_train_ctx);
-        LOGI("Using user-provided context size: %d (requested: %d, model max: %d)", context_size_,
-             user_context_size, model_train_ctx);
-    } else {
-        context_size_ = std::min({model_train_ctx, max_default_context_, adaptive_max_context});
-        LOGI("Auto-detected context size: %d (model: %d, cap: %d, adaptive: %d)", context_size_,
-             model_train_ctx, max_default_context_, adaptive_max_context);
+    if (ctx_params.n_ctx == 0) {
+        ctx_params.n_ctx = std::min(model_train_ctx, max_default_context_);
     }
+    context_size_ = std::min({(int)ctx_params.n_ctx, model_train_ctx, max_default_context_});
+
+    LOGI("Final context size: %d (fitted=%u, train=%d, cap=%d)",
+         context_size_, ctx_params.n_ctx, model_train_ctx, max_default_context_);
+
+    int max_safe_batch = 2048; // Configurable parameter
+    int safe_batch_size = std::min(context_size_, max_safe_batch);
 
-    llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = context_size_;
-    ctx_params.n_batch = context_size_; // Allow processing full prompt at once
-    ctx_params.n_ubatch = context_size_; // Physical batch size must also match
-    ctx_params.n_threads = backend_->get_num_threads();
-    ctx_params.n_threads_batch = backend_->get_num_threads();
-    ctx_params.no_perf = true;
+    ctx_params.n_batch = safe_batch_size;
+    ctx_params.n_ubatch = safe_batch_size;
 
     context_ = llama_init_from_model(model_, ctx_params);
 
@@ -802,11 +797,13 @@ bool LlamaCppTextGeneration::recreate_context() {
         context_ = nullptr;
     }
 
-    // Create new context (adapters are now visible to it)
+    int max_safe_batch = 2048; // Configurable parameter
+    int safe_batch_size = std::min(context_size_, max_safe_batch);
+
     llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = context_size_;
-    ctx_params.n_batch = context_size_;
-    ctx_params.n_ubatch = context_size_;
+    ctx_params.n_batch = safe_batch_size;
+    ctx_params.n_ubatch = safe_batch_size;
     ctx_params.n_threads = backend_->get_num_threads();
     ctx_params.n_threads_batch = backend_->get_num_threads();
    ctx_params.no_perf = true;
