From fd76a58f0ce8b535a789183660b0cc9bd3df8d52 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 25 Mar 2026 15:57:23 +0800 Subject: [PATCH 1/2] llama-model-loader: use pinned memory for tensor overrides --- src/llama-model-loader.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 413f34c2268..ebc2949561c 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1117,6 +1117,7 @@ struct ggml_tensor * llama_model_loader::create_tensor( } ggml_backend_buffer_type_t buft = nullptr; + bool buft_from_override = false; // check overrides if (tensor_buft_overrides) { @@ -1130,6 +1131,7 @@ struct ggml_tensor * llama_model_loader::create_tensor( } else { buft = overrides->buft; } + buft_from_override = true; LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n", tensor_name.c_str(), @@ -1148,8 +1150,9 @@ struct ggml_tensor * llama_model_loader::create_tensor( } // avoid using a host buffer when using mmap + // but keep host buffers for overridden tensors - they need the host buffer for pinned memory auto * buft_dev = ggml_backend_buft_get_device(buft); - if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { + if (use_mmap && !buft_from_override && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); if (!cpu_dev) { throw std::runtime_error("no CPU backend found"); From 53776225540badad624f2d5d5787f6c2c912c0b3 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 28 Mar 2026 12:49:48 +0800 Subject: [PATCH 2/2] change to warning --- src/llama-model-loader.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index ebc2949561c..9ac10a969ef 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1117,7 +1117,6 @@ struct ggml_tensor * llama_model_loader::create_tensor( } ggml_backend_buffer_type_t buft = nullptr; - bool buft_from_override = false; // check overrides if (tensor_buft_overrides) { @@ -1128,10 +1127,15 @@ struct ggml_tensor * llama_model_loader::create_tensor( if (overrides->buft == ggml_backend_cpu_buffer_type()) { // when overriding to a CPU buffer, consider the extra buffer types buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu); + if (use_mmap) { + static std::once_flag once; + std::call_once(once, [] { + LLAMA_LOG_WARN("llama_model_loader: tensor overrides to CPU are used with mmap enabled - consider using --no-mmap for better performance\n"); + }); + } } else { buft = overrides->buft; } - buft_from_override = true; LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n", tensor_name.c_str(), @@ -1150,9 +1154,8 @@ struct ggml_tensor * llama_model_loader::create_tensor( } // avoid using a host buffer when using mmap - // but keep host buffers for overridden tensors - they need the host buffer for pinned memory auto * buft_dev = ggml_backend_buft_get_device(buft); - if (use_mmap && !buft_from_override && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { + if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); if (!cpu_dev) { throw std::runtime_error("no CPU backend found");