Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3127,6 +3127,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.models_max = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
// Router-server option: amount of memory, in MiB, to leave free on each device.
// The parsed integer is stored unchanged in params.models_memory_margin.
// Help text says "0 = disabled" so it matches the field's own comment
// ("0 = disabled"); the previous "0 = unlimited" read as "leave unlimited memory free".
add_opt(common_arg(
    {"--models-memory-margin"}, "N",
    string_format("for router server, MiB of memory to leave free, per device (default: %d, 0 = disabled)", params.models_memory_margin),
    [](common_params & params, int value) {
        // NOTE(review): negative values are accepted and stored as-is; presumably
        // meaningless for a margin - confirm whether they should be rejected here.
        params.models_memory_margin = value;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MARGIN"));
add_opt(common_arg(
{"--models-autoload"},
{"--no-models-autoload"},
Expand Down Expand Up @@ -3356,6 +3363,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.offline = true;
}
).set_env("LLAMA_ARG_OFFLINE"));
// --download-only: value-less switch; when present, sets params.download_only.
add_opt(common_arg(
    {"--download-only"},
    "Download the model file(s) and exit",
    [](common_params & p) { p.download_only = true; }
));
// --measure-only: value-less switch; when present, sets params.measure_only.
add_opt(common_arg(
    {"--measure-only"},
    "Load the model to measure memory requirements, print to stdout, then exit",
    [](common_params & p) { p.measure_only = true; }
));
add_opt(common_arg(
{"-lv", "--verbosity", "--log-verbosity"}, "N",
string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
Expand Down
11 changes: 7 additions & 4 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,8 @@ struct common_params {
int32_t control_vector_layer_end = -1; // layer range for control vector
bool offline = false;
bool skip_download = false; // skip model file downloading
bool download_only = false; // only download the model if required, don't start the server
bool measure_only = false; // load model with no_alloc to measure memory, print to stdout, then exit

int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
Expand Down Expand Up @@ -637,10 +639,11 @@ struct common_params {
std::vector<std::string> server_tools;

// router server configs
std::string models_dir = ""; // directory containing models for the router server
std::string models_preset = ""; // directory containing model presets for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
std::string models_dir = ""; // directory containing models for the router server
std::string models_preset = ""; // directory containing model presets for the router server
int models_max = 4; // maximum number of models to load simultaneously
int models_memory_margin = 1024; // MiB of free memory to preserve per device (0 = disabled)
bool models_autoload = true; // automatically load models when requested via the router server

bool log_json = false;

Expand Down
Loading
Loading