ggml-org · 0cc4m · Mar 29, 2026 · Mar 29, 2026 · Mar 31, 2026 · Mar 31, 2026
@@ -3127,6 +3127,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.models_max = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
+    add_opt(common_arg(
+        {"--models-memory-margin"}, "N",
+        string_format("for router server, MiB of memory to leave free, per device (default: %d, 0 = disabled)", params.models_memory_margin),
+        [](common_params & params, int value) {
+            params.models_memory_margin = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MARGIN"));
     add_opt(common_arg(
         {"--models-autoload"},
         {"--no-models-autoload"},
@@ -3356,6 +3363,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.offline = true;
         }
     ).set_env("LLAMA_ARG_OFFLINE"));
+    add_opt(common_arg(
+        {"--download-only"},
+        "Download the model file(s) and exit",
+        [](common_params & params) {
+            params.download_only = true;
+        }
+    ));
+    add_opt(common_arg(
+        {"--measure-only"},
+        "Load the model to measure memory requirements, print to stdout, then exit",
+        [](common_params & params) {
+            params.measure_only = true;
+        }
+    ));
     add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"

@@ -508,6 +508,8 @@ struct common_params {
     int32_t control_vector_layer_end   = -1; // layer range for control vector
     bool    offline                    = false;
     bool    skip_download              = false; // skip model file downloading
+    bool    download_only              = false; // only download the model if required, don't start the server
+    bool    measure_only               = false; // load model with no_alloc to measure memory, print to stdout, then exit
 
     int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -637,10 +639,11 @@ struct common_params {
     std::vector<std::string> server_tools;
 
     // router server configs
-    std::string models_dir    = ""; // directory containing models for the router server
-    std::string models_preset = ""; // directory containing model presets for the router server
-    int models_max = 4;             // maximum number of models to load simultaneously
-    bool models_autoload = true;    // automatically load models when requested via the router server
+    std::string models_dir    = "";  // directory containing models for the router server
+    std::string models_preset = "";  // directory containing model presets for the router server
+    int models_max = 4;              // maximum number of models to load simultaneously
+    int models_memory_margin = 1024; // MiB of free memory to preserve per device (0 = disabled)
+    bool models_autoload = true;     // automatically load models when requested via the router server
 
     bool log_json = false;