
Commit 1eb7542

Fix Greptile issues on top of llama_params_fit integration
Four follow-ups to the llama_params_fit adoption from PR #444:

1. CPU-only build no longer silently drops a user gpu_layers override. Previously, the CPU-only preprocessor block ran *after* the user override and unconditionally set n_gpu_layers=0, discarding the config value with no log. Now the override is scoped to the GPU-build branch; the CPU-only branch logs a warning when a non-zero user value is being ignored because no GPU backend is available.
2. n_ctx_min is now read from config["n_ctx_min"] (the inline comment "Configurable parameter" previously didn't match behavior); a sketch of the relevant config keys follows below.
3. When llama_params_fit returns FAILURE or ERROR and the user's requested context_size is larger than 2048, we now log that it is being capped instead of silently clamping.
4. Restore the trailing newline at EOF (it was stripped by the PR).
1 parent e00c21d commit 1eb7542
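For context, the options touched by these follow-ups all arrive through the backend's JSON config. The sketch below shows a config that would exercise the new paths; `n_ctx_min` is the key this commit adds, while `context_size` and `gpu_layers` are assumed key names inferred from the `user_context_size` / `user_gpu_layers` locals in the diff, not confirmed by it.

```cpp
#include <nlohmann/json.hpp>

// Hypothetical caller-side config (key names other than "n_ctx_min" are assumptions).
nlohmann::json config = {
    {"context_size", 8192},  // capped to 2048 (with a log line) if llama_params_fit reports FAILURE/ERROR
    {"gpu_layers",   32},    // applied on GPU builds; warned about and ignored on CPU-only builds
    {"n_ctx_min",    1024}   // new: minimum context passed to llama_params_fit
};
```

With this config, a CPU-only build now emits an "ignoring user gpu_layers=32" log instead of silently discarding the value.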

1 file changed: sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp (24 additions, 10 deletions)
```diff
@@ -274,6 +274,9 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
     }
 
     uint32_t n_ctx_min = 2048;  // Configurable parameter
+    if (config.contains("n_ctx_min")) {
+        n_ctx_min = config["n_ctx_min"].get<uint32_t>();
+    }
 
     RAC_LOG_INFO("LLM.LlamaCpp", "Calling llama_params_fit (margin=%zuMiB, n_ctx_min=%u, n_devices=%zu)",
                  margin_mib, n_ctx_min, n_devices);
```
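As an aside, the contains()/get<uint32_t>() pair above could also be collapsed with nlohmann::json's value() helper; this is a sketch of the alternative, not the committed code:

```cpp
// Equivalent lookup with a default, via nlohmann::json::value() (sketch only).
uint32_t n_ctx_min = config.value("n_ctx_min", 2048u);
```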
```diff
@@ -298,6 +301,10 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
             RAC_LOG_INFO("LLM.LlamaCpp", "llama_params_fit FAILURE: could not fit model to device memory. "
                          "Proceeding with conservative CPU-only defaults.");
             model_params.n_gpu_layers = 0;
+            if (user_context_size > 0 && user_context_size > 2048) {
+                RAC_LOG_INFO("LLM.LlamaCpp", "Capping user-requested context_size=%d to 2048 after fit FAILURE",
+                             user_context_size);
+            }
             if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
                 ctx_params.n_ctx = 2048;
             }
```
```diff
@@ -306,31 +313,38 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
             RAC_LOG_ERROR("LLM.LlamaCpp", "llama_params_fit ERROR for model: %s. "
                           "Falling back to conservative CPU-only defaults.", model_path.c_str());
             model_params.n_gpu_layers = 0;
+            if (user_context_size > 0 && user_context_size > 2048) {
+                RAC_LOG_INFO("LLM.LlamaCpp", "Capping user-requested context_size=%d to 2048 after fit ERROR",
+                             user_context_size);
+            }
             if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
                 ctx_params.n_ctx = 2048;
             }
             break;
     }
 
-    // Apply user gpu_layers override after fit
-    if (user_gpu_layers >= 0) {
-        model_params.n_gpu_layers = user_gpu_layers;
-        RAC_LOG_INFO("LLM.LlamaCpp", "Applying user GPU layers override: %d", user_gpu_layers);
-    }
-
-    // Currently llama_params_fit does not detect cpu only memory
-    // There is an ongoing upstream PR (https://github.com/ggml-org/llama.cpp/pull/19711)
-    // that will solve this; until then, force CPU-only defaults.
+    // Apply user gpu_layers override after fit, respecting the CPU-only build constraint.
+    // llama_params_fit does not yet account for host memory in CPU-only builds
+    // (upstream PR: https://github.com/ggml-org/llama.cpp/pull/19711).
 #if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU)
     if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
         RAC_LOG_INFO("LLM.LlamaCpp", "CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. "
                      "Applying conservative CPU defaults.");
     }
+    if (user_gpu_layers > 0) {
+        RAC_LOG_INFO("LLM.LlamaCpp", "CPU-only build: ignoring user gpu_layers=%d (no GPU backend available)",
+                     user_gpu_layers);
+    }
     model_params.n_gpu_layers = 0;
     if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) {
         ctx_params.n_ctx = 4096;
         RAC_LOG_INFO("LLM.LlamaCpp", "CPU-only: capping context to %u", ctx_params.n_ctx);
     }
+#else
+    if (user_gpu_layers >= 0) {
+        model_params.n_gpu_layers = user_gpu_layers;
+        RAC_LOG_INFO("LLM.LlamaCpp", "Applying user GPU layers override: %d", user_gpu_layers);
+    }
 #endif
 
     RAC_LOG_INFO("LLM.LlamaCpp", "Loading model with n_gpu_layers=%d", model_params.n_gpu_layers);
```
```diff
@@ -1363,4 +1377,4 @@ nlohmann::json LlamaCppTextGeneration::get_lora_info() const {
     return adapters;
 }
 
-} // namespace runanywhere
\ No newline at end of file
+} // namespace runanywhere
```
