Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tools/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1651,6 +1651,7 @@ Note:
2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follows:
- If a model is running but updated or removed from the source, it will be unloaded
- If a model is not running, it will be added or updated according to the source
3. Once a model is loaded, the info reported by the model instance's own `/v1/models` endpoint is forwarded to the router's `/v1/models` response. This includes metadata about the model and the runtime instance.

The `status` object can be:

Expand Down
37 changes: 21 additions & 16 deletions tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3921,22 +3921,7 @@ void server_routes::init_routes() {
}},
{"object", "list"},
{"data", {
{
{"id", meta->model_name},
{"aliases", meta->model_aliases},
{"tags", meta->model_tags},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
{"meta", {
{"vocab_type", meta->model_vocab_type},
{"n_vocab", meta->model_vocab_n_tokens},
{"n_ctx_train", meta->model_n_ctx_train},
{"n_embd", meta->model_n_embd_inp},
{"n_params", meta->model_n_params},
{"size", meta->model_size},
}},
},
get_model_info(),
}}
};

Expand Down Expand Up @@ -4150,6 +4135,26 @@ void server_routes::init_routes() {
};
}

// Build the JSON entry describing the model served by this instance.
// The same object is used as the single element of the "data" list in the
// model-list response and is handed to the router when running as a child.
// "created" is the wall-clock time at the moment of the call.
json server_routes::get_model_info() const {
    // values read from the server meta; "n_ctx" comes from meta->slot_n_ctx,
    // all other fields from the model_* members
    const json details = {
        {"vocab_type",  meta->model_vocab_type},
        {"n_vocab",     meta->model_vocab_n_tokens},
        {"n_ctx",       meta->slot_n_ctx},
        {"n_ctx_train", meta->model_n_ctx_train},
        {"n_embd",      meta->model_n_embd_inp},
        {"n_params",    meta->model_n_params},
        {"size",        meta->model_size},
    };

    return json {
        {"id",       meta->model_name},
        {"aliases",  meta->model_aliases},
        {"tags",     meta->model_tags},
        {"object",   "model"},
        {"created",  std::time(nullptr)},
        {"owned_by", "llamacpp"},
        {"meta",     details},
    };
}

std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const server_http_req & req, int id_slot) {
auto res = create_response();
const json request_data = json::parse(req.body);
Expand Down
4 changes: 4 additions & 0 deletions tools/server/server-context.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@ struct server_routes {
server_http_context::handler_t post_rerank;
server_http_context::handler_t get_lora_adapters;
server_http_context::handler_t post_lora_adapters;

// to be used in router mode
json get_model_info() const;

private:
std::unique_ptr<server_res_generator> handle_completions_impl(
const server_http_req & req,
Expand Down
56 changes: 48 additions & 8 deletions tools/server/server-models.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ extern char **environ;
#define CMD_ROUTER_TO_CHILD_EXIT "cmd_router_to_child:exit"
#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep
#define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep"
#define CMD_CHILD_TO_ROUTER_INFO "cmd_child_to_router:info:" // followed by json string

// address for child process, this is needed because router may run on 0.0.0.0
// ref: https://github.com/ggml-org/llama.cpp/issues/17862
Expand Down Expand Up @@ -718,10 +719,11 @@ void server_models::load(const std::string & name) {

// prepare new instance info
instance_t inst;
inst.meta = meta;
inst.meta.port = get_free_port();
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
inst.meta.last_used = ggml_time_ms();
inst.meta = meta;
inst.meta.port = get_free_port();
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
inst.meta.loaded_info = json{};
inst.meta.last_used = ggml_time_ms();

if (inst.meta.port <= 0) {
throw std::runtime_error("failed to get a port number");
Expand Down Expand Up @@ -767,12 +769,14 @@ void server_models::load(const std::string & name) {
// read stdout/stderr and forward to main server log
// also handle status report from child process
if (stdout_file) {
char buffer[4096];
char buffer[128 * 1024]; // large buffer for storing info
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better allocate this on the heap

while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
LOG("[%5d] %s", port, buffer);
std::string str(buffer);
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) {
this->update_loaded_info(name, str);
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
}
Expand Down Expand Up @@ -916,6 +920,29 @@ void server_models::update_status(const std::string & name, server_model_status
cv.notify_all();
}

// Parse a model-info report received from a child instance and store it in
// the meta of the matching model so it can be reflected by the router.
//
// name:     model name used as key in `mapping`
// raw_info: one line of child output, expected to be CMD_CHILD_TO_ROUTER_INFO
//           immediately followed by a JSON object
//
// Malformed input (wrong prefix, invalid JSON, or JSON that is not an object)
// is logged as a warning and ignored; the previously stored info is kept.
// Takes `mutex` while updating and notifies `cv` afterwards.
void server_models::update_loaded_info(const std::string & name, std::string & raw_info) {
    if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) {
        SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str());
        return;
    }

    json info;
    try {
        // everything after the command prefix is the JSON payload
        info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO)));
    } catch (const std::exception & e) {
        SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what());
        return;
    }

    // the router merges this value key-by-key into its model list entry;
    // iterating keys of an array or scalar would throw there, so only accept objects
    if (!info.is_object()) {
        SRV_WRN("unexpected loaded info type from child for model name=%s: %s\n", name.c_str(), info.type_name());
        return;
    }

    std::unique_lock<std::mutex> lk(mutex);
    auto it = mapping.find(name);
    if (it != mapping.end()) {
        it->second.meta.loaded_info = info;
    }
    cv.notify_all();
}

void server_models::wait_until_loading_finished(const std::string & name) {
std::unique_lock<std::mutex> lk(mutex);
cv.wait(lk, [this, &name]() {
Expand Down Expand Up @@ -994,12 +1021,14 @@ bool server_models::is_child_server() {
return router_port != nullptr;
}

std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler) {
std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info) {
// send a notification to the router server that a model instance is ready
common_log_pause(common_log_main());
fflush(stdout);
fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
fflush(stdout);
fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str());
fflush(stdout);
common_log_resume(common_log_main());

// setup thread for monitoring stdin
Expand Down Expand Up @@ -1176,7 +1205,8 @@ void server_models_routes::init_routes() {
status["exit_code"] = meta.exit_code;
status["failed"] = true;
}
models_json.push_back(json {

json model_info = json {
{"id", meta.name},
{"aliases", meta.aliases},
{"tags", meta.tags},
Expand All @@ -1185,7 +1215,17 @@ void server_models_routes::init_routes() {
{"created", t}, // for OAI-compat
{"status", status},
// TODO: add other fields, may require reading GGUF metadata
});
};

// merge with loaded_info from the child process if available
if (meta.is_running()) {
for (auto it = meta.loaded_info.begin(); it != meta.loaded_info.end(); ++it) {
if (!model_info.contains(it.key())) {
model_info[it.key()] = it.value();
}
}
}
models_json.push_back(model_info);
}
res_ok(res, {
{"data", models_json},
Expand Down
4 changes: 3 additions & 1 deletion tools/server/server-models.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ struct server_model_meta {
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
int64_t last_used = 0; // for LRU unloading
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
json loaded_info; // info to be reflected via /v1/models endpoint
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown

Expand Down Expand Up @@ -145,6 +146,7 @@ struct server_models {

// update the status of a model instance (thread-safe)
void update_status(const std::string & name, server_model_status status, int exit_code);
void update_loaded_info(const std::string & name, std::string & raw_info);

// wait until the model instance is fully loaded (thread-safe)
// returns when the model is no longer in the "loading" state
Expand All @@ -163,7 +165,7 @@ struct server_models {

// notify the router server that a model instance is ready, then send it the model info
// return the monitoring thread (to be joined by the caller)
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler);
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info);

// notify the router server that the sleeping state has changed
static void notify_router_sleeping_state(bool sleeping);
Expand Down
3 changes: 2 additions & 1 deletion tools/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,8 @@ int main(int argc, char ** argv) {
// optionally, notify router server that this instance is ready
std::thread monitor_thread;
if (server_models::is_child_server()) {
monitor_thread = server_models::setup_child_server(shutdown_handler);
json model_info = routes.get_model_info();
monitor_thread = server_models::setup_child_server(shutdown_handler, model_info);
}

// this call blocks the main thread until queue_tasks.terminate() is called
Expand Down
Loading