Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3072,6 +3072,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.models_max = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
// Router-server option: amount of memory, in MiB, to keep free on each device.
// The parsed integer is stored as-is in common_params::models_memory_margin;
// a value of 0 turns the margin off (matching the field's own documentation).
add_opt(common_arg(
    {"--models-memory-margin"}, "N",
    // The help text previously said "0 = unlimited", wording carried over from --models-max
    // that does not describe a margin; "0 = disabled" matches the field comment.
    string_format("for router server, MiB of memory to leave free, per device (default: %d, 0 = disabled)", params.models_memory_margin),
    [](common_params & params, int value) {
        params.models_memory_margin = value;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MARGIN"));
add_opt(common_arg(
{"--models-autoload"},
{"--no-models-autoload"},
Expand Down Expand Up @@ -3301,6 +3308,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.offline = true;
}
).set_env("LLAMA_OFFLINE"));
// --download-only: value-less flag; when present, sets common_params::download_only.
add_opt(common_arg(
    {"--download-only"},
    "Download the model file(s) and exit",
    [](common_params & cfg) { cfg.download_only = true; }
));
add_opt(common_arg(
{"-lv", "--verbosity", "--log-verbosity"}, "N",
string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
Expand Down
10 changes: 6 additions & 4 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,7 @@ struct common_params {
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
bool offline = false;
bool download_only = false; // only download the model if required, don't start the server

int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
Expand Down Expand Up @@ -607,10 +608,11 @@ struct common_params {
std::vector<std::string> server_tools;

// router server configs
std::string models_dir = ""; // directory containing models for the router server
std::string models_preset = ""; // directory containing model presets for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
std::string models_dir = ""; // directory containing models for the router server
std::string models_preset = ""; // directory containing model presets for the router server
int models_max = 4; // maximum number of models to load simultaneously
int models_memory_margin = 1024; // MiB of free memory to preserve per device (0 = disabled)
bool models_autoload = true; // automatically load models when requested via the router server

bool log_json = false;

Expand Down
13 changes: 13 additions & 0 deletions src/llama-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3493,6 +3493,19 @@ void llama_perf_context_reset(llama_context * ctx) {
ctx->perf_reset();
}

// Adds up the memory-breakdown totals of the buffer types in `ctx` that are attributed to `device`.
//
// Matching rule:
//   - if `device` is a CPU-type device, every host buffer type is counted;
//   - otherwise only buffer types whose backend device equals `device` are counted.
//
// Returns the accumulated total; 0 when no buffer type matches.
uint64_t llama_context_device_memory(const llama_context * ctx, ggml_backend_dev_t device) {
    const bool want_host = ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;

    uint64_t sum = 0;
    for (const auto & [buft, usage] : ctx->memory_breakdown()) {
        bool counted;
        if (want_host) {
            counted = ggml_backend_buft_is_host(buft);
        } else {
            counted = ggml_backend_buft_get_device(buft) == device;
        }

        if (!counted) {
            continue;
        }
        sum += usage.total();
    }

    return sum;
}

//
// training
//
Expand Down
6 changes: 6 additions & 0 deletions src/llama-ext.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,9 @@ LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);
LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);

LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);

// Returns the projected memory use (model + context + compute) in bytes
// for the given device within this context. Returns 0 if the device is not used.
// For a CPU-type device the result covers every host buffer type in the context's
// memory breakdown; for any other device it covers only the buffer types whose
// backend device is exactly `device`.
LLAMA_API uint64_t llama_context_device_memory(
const struct llama_context * ctx,
ggml_backend_dev_t device);
8 changes: 7 additions & 1 deletion tools/server/server-http.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,13 @@ struct server_http_res {
return next != nullptr;
}

virtual ~server_http_res() = default;
// Optional hook invoked from the destructor; empty by default, in which case nothing runs.
std::function<void()> on_destroy = nullptr;

// Runs the on_destroy hook, if one was installed, while the response object is being destroyed.
virtual ~server_http_res() {
    if (!on_destroy) {
        return;
    }
    on_destroy();
}
};

// unique pointer, used by set_chunked_content_provider
Expand Down
Loading
Loading