diff --git a/common/arg.cpp b/common/arg.cpp
index 1302065498..1ef3278abb 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2139,7 +2139,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
-        string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
+        string_format("max. number of layers to store in VRAM, -1 is auto, <= -2 is all (default: %d)", params.n_gpu_layers),
         [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
@@ -3177,7 +3177,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
-        "number of layers to store in VRAM for the draft model",
+        string_format("max. number of draft model layers to store in VRAM, -1 is auto, <= -2 is all (default: %d)", params.speculative.n_gpu_layers),
         [](common_params & params, int value) {
             params.speculative.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
diff --git a/common/common.cpp b/common/common.cpp
index d4e8c7405e..ffde088a9b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1339,10 +1339,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.devices = params.devices.data();
     }
 
-    if (params.n_gpu_layers != -1) {
-        mparams.n_gpu_layers = params.n_gpu_layers;
-    }
-
+    mparams.n_gpu_layers = params.n_gpu_layers;
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
diff --git a/common/common.h b/common/common.h
index 334372073a..f8bc686b6f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -329,7 +329,7 @@ struct common_params {
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     bool fit_params = true; // whether to fit unset model/context parameters to free device memory
diff --git a/include/llama.h b/include/llama.h
index f862930099..aa9f01c545 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -286,7 +286,7 @@ extern "C" {
         // NULL-terminated list of buffer types to use for tensors that match a pattern
         const struct llama_model_tensor_buft_override * tensor_buft_overrides;
 
-        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -468,8 +468,9 @@ extern "C" {
     LLAMA_API void llama_free(struct llama_context * ctx);
 
     // fits mparams and cparams to free device memory (assumes system memory is unlimited)
-    // returns true if the parameters could be successfully modified to fit device memory
-    // this function is NOT thread safe because it modifies the global llama logger state
+    // - returns true if the parameters could be successfully modified to fit device memory
+    // - this function is NOT thread safe because it modifies the global llama logger state
+    // - only parameters that have the same value as in llama_model_default_params are modified
     LLAMA_API bool llama_params_fit(
             const char * path_model,
             struct llama_model_params * mparams,
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 8786d4ee3e..6ead268205 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -294,7 +294,7 @@ llama_context::llama_context(
     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
     bool pipeline_parallel =
         model.n_devices() > 1 &&
-        model.params.n_gpu_layers > (int) model.hparams.n_layer &&
+        uint32_t(model.params.n_gpu_layers) > model.hparams.n_layer &&
         model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
         cparams.offload_kqv &&
         !model.has_tensor_overrides();
@@ -1571,7 +1571,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
 
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
         // FIXME: fix in ggml_backend_sched
-        const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
+        const bool full_offload = uint32_t(model.params.n_gpu_layers) > model.hparams.n_layer;
         if (ubatch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 const auto & dev_layer = model.dev_layer(il);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d2270e8f2d..e103094393 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2329,11 +2329,11 @@ void llama_model::load_vocab(llama_model_loader & ml) {
 
 bool llama_model::load_tensors(llama_model_loader & ml) {
     const auto & split_mode   = params.split_mode;
-    const auto & n_gpu_layers = params.n_gpu_layers;
     const auto & use_mlock    = params.use_mlock;
     const auto & tensor_split = params.tensor_split;
 
-    const int n_layer = hparams.n_layer;
+    const int n_layer      = hparams.n_layer;
+    const int n_gpu_layers = params.n_gpu_layers >= 0 ? params.n_gpu_layers : n_layer + 1;
 
     const bool use_mmap_buffer = true;
 
@@ -7662,7 +7662,7 @@ llama_model_params llama_model_default_params() {
     llama_model_params result = {
         /*.devices =*/ nullptr,
         /*.tensor_buft_overrides =*/ nullptr,
-        /*.n_gpu_layers =*/ 999,
+        /*.n_gpu_layers =*/ -1,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
diff --git a/src/llama.cpp b/src/llama.cpp
index 1e18637e36..433dfd3a5d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -181,7 +181,7 @@ static void llama_params_fit_impl(
         }
     }
 
-    int64_t sum_total = 0;
+    int64_t sum_free = 0;
     int64_t sum_projected_free = 0;
     int64_t min_projected_free = INT64_MAX;
     int64_t sum_projected_used = 0;
@@ -197,7 +197,7 @@
         const int64_t projected_used = dmd.mb.total();
         const int64_t projected_free = dmd.free - projected_used;
 
-        sum_total += dmd.total;
+        sum_free += dmd.free;
         sum_projected_used += projected_used;
         sum_projected_free += projected_free;
         min_projected_free = std::min(min_projected_free, projected_free);
@@ -210,10 +210,10 @@
                 projected_free >= 0 ? "surplus" : "deficit");
         }
     }
-    assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
+    assert(sum_free >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
     assert(sum_projected_used >= sum_projected_ctx);
     LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-        __func__, sum_projected_used/MiB, sum_total/MiB);
+        __func__, sum_projected_used/MiB, sum_free/MiB);
     if (min_projected_free >= margin) {
         if (nd == 1) {
             LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",