diff --git a/common/arg.cpp b/common/arg.cpp
index 1302065498..1ef3278abb 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2139,7 +2139,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
-        string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
+        string_format("max. number of layers to store in VRAM, -1 is auto, <= -2 is all (default: %d)", params.n_gpu_layers),
         [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
@@ -3177,7 +3177,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
-        "number of layers to store in VRAM for the draft model",
+        string_format("max. number of draft model layers to store in VRAM, -1 is auto, <= -2 is all (default: %d)", params.speculative.n_gpu_layers),
         [](common_params & params, int value) {
             params.speculative.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
diff --git a/common/common.cpp b/common/common.cpp
index d4e8c7405e..ffde088a9b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1339,10 +1339,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.devices = params.devices.data();
     }
 
-    if (params.n_gpu_layers != -1) {
-        mparams.n_gpu_layers = params.n_gpu_layers;
-    }
-
+    mparams.n_gpu_layers = params.n_gpu_layers;
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
diff --git a/common/common.h b/common/common.h
index 334372073a..f8bc686b6f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -329,7 +329,7 @@ struct common_params {
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     bool fit_params = true; // whether to fit unset model/context parameters to free device memory
diff --git a/include/llama.h b/include/llama.h
index f862930099..aa9f01c545 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -286,7 +286,7 @@ extern "C" {
         // NULL-terminated list of buffer types to use for tensors that match a pattern
         const struct llama_model_tensor_buft_override * tensor_buft_overrides;
 
-        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -468,8 +468,9 @@ extern "C" {
     LLAMA_API void llama_free(struct llama_context * ctx);
 
     // fits mparams and cparams to free device memory (assumes system memory is unlimited)
-    // returns true if the parameters could be successfully modified to fit device memory
-    // this function is NOT thread safe because it modifies the global llama logger state
+    // - returns true if the parameters could be successfully modified to fit device memory
+    // - this function is NOT thread safe because it modifies the global llama logger state
+    // - only parameters that have the same value as in llama_model_default_params are modified
     LLAMA_API bool llama_params_fit(
             const char * path_model,
             struct llama_model_params * mparams,
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 8786d4ee3e..6ead268205 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -294,7 +294,7 @@ llama_context::llama_context(
     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
     bool pipeline_parallel =
         model.n_devices() > 1 &&
-        model.params.n_gpu_layers > (int) model.hparams.n_layer &&
+        uint32_t(model.params.n_gpu_layers) > model.hparams.n_layer &&
         model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
         cparams.offload_kqv &&
         !model.has_tensor_overrides();
@@ -1571,7 +1571,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
 
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
         // FIXME: fix in ggml_backend_sched
-        const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
+        const bool full_offload = uint32_t(model.params.n_gpu_layers) > model.hparams.n_layer;
         if (ubatch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 const auto & dev_layer = model.dev_layer(il);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d2270e8f2d..e103094393 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2329,11 +2329,11 @@ void llama_model::load_vocab(llama_model_loader & ml) {
 
 bool llama_model::load_tensors(llama_model_loader & ml) {
     const auto & split_mode   = params.split_mode;
-    const auto & n_gpu_layers = params.n_gpu_layers;
     const auto & use_mlock    = params.use_mlock;
     const auto & tensor_split = params.tensor_split;
 
-    const int n_layer = hparams.n_layer;
+    const int n_layer      = hparams.n_layer;
+    const int n_gpu_layers = params.n_gpu_layers >= 0 ? params.n_gpu_layers : n_layer + 1;
 
     const bool use_mmap_buffer = true;
 
@@ -7662,7 +7662,7 @@ llama_model_params llama_model_default_params() {
     llama_model_params result = {
         /*.devices =*/ nullptr,
         /*.tensor_buft_overrides =*/ nullptr,
-        /*.n_gpu_layers =*/ 999,
+        /*.n_gpu_layers =*/ -1,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
diff --git a/src/llama.cpp b/src/llama.cpp
index 1e18637e36..433dfd3a5d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -181,7 +181,7 @@ static void llama_params_fit_impl(
         }
     }
 
-    int64_t sum_total = 0;
+    int64_t sum_free = 0;
     int64_t sum_projected_free = 0;
     int64_t min_projected_free = INT64_MAX;
     int64_t sum_projected_used = 0;
@@ -197,7 +197,7 @@
         const int64_t projected_used = dmd.mb.total();
         const int64_t projected_free = dmd.free - projected_used;
 
-        sum_total += dmd.total;
+        sum_free += dmd.free;
         sum_projected_used += projected_used;
         sum_projected_free += projected_free;
         min_projected_free = std::min(min_projected_free, projected_free);
@@ -210,10 +210,10 @@
                 projected_free >= 0 ? "surplus" : "deficit");
         }
     }
-    assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
+    assert(sum_free >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
     assert(sum_projected_used >= sum_projected_ctx);
     LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-        __func__, sum_projected_used/MiB, sum_total/MiB);
+        __func__, sum_projected_used/MiB, sum_free/MiB);
     if (min_projected_free >= margin) {
         if (nd == 1) {
             LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",