ggml-org · ngxson · May 8, 2026 · May 4, 2026 · May 4, 2026 · ggerganov
@@ -1651,6 +1651,7 @@ Note:
 2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follow:
     - If a model is running but updated or removed from the source, it will be unloaded
     - If a model is not running, it will be added or updated according to the source
+3. When a model is loaded, the info reported by that model instance's own `/v1/models` is forwarded to the router's `/v1/models`. This includes metadata about the model and the runtime instance.
 
 The `status` object can be:
 

@@ -3921,22 +3921,7 @@ void server_routes::init_routes() {
             }},
             {"object", "list"},
             {"data", {
-                {
-                    {"id",       meta->model_name},
-                    {"aliases",  meta->model_aliases},
-                    {"tags",     meta->model_tags},
-                    {"object",   "model"},
-                    {"created",  std::time(0)},
-                    {"owned_by", "llamacpp"},
-                    {"meta",     {
-                        {"vocab_type",  meta->model_vocab_type},
-                        {"n_vocab",     meta->model_vocab_n_tokens},
-                        {"n_ctx_train", meta->model_n_ctx_train},
-                        {"n_embd",      meta->model_n_embd_inp},
-                        {"n_params",    meta->model_n_params},
-                        {"size",        meta->model_size},
-                    }},
-                },
+                get_model_info(),
             }}
         };
 
@@ -4150,6 +4135,26 @@ void server_routes::init_routes() {
     };
 }
 
+// Describe the loaded model as a single JSON object.
+//
+// Top-level keys: "id", "aliases", "tags", "object", "created", "owned_by", "meta".
+// "created" is the time of the call (std::time), not a value stored with the model.
+json server_routes::get_model_info() const {
+    // model properties that end up nested under the "meta" key
+    json model_props = {
+        {"vocab_type",  meta->model_vocab_type},
+        {"n_vocab",     meta->model_vocab_n_tokens},
+        {"n_ctx",       meta->slot_n_ctx},
+        {"n_ctx_train", meta->model_n_ctx_train},
+        {"n_embd",      meta->model_n_embd_inp},
+        {"n_params",    meta->model_n_params},
+        {"size",        meta->model_size},
+    };
+
+    json model_entry = {
+        {"id",       meta->model_name},
+        {"aliases",  meta->model_aliases},
+        {"tags",     meta->model_tags},
+        {"object",   "model"},
+        {"created",  std::time(nullptr)},
+        {"owned_by", "llamacpp"},
+        {"meta",     model_props},
+    };
+    return model_entry;
+}
+
 std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const server_http_req & req, int id_slot) {
     auto res = create_response();
     const json request_data = json::parse(req.body);

@@ -122,6 +122,10 @@ struct server_routes {
     server_http_context::handler_t post_rerank;
     server_http_context::handler_t get_lora_adapters;
     server_http_context::handler_t post_lora_adapters;
+
+    // to be used in router mode
+    json get_model_info() const;
+
 private:
     std::unique_ptr<server_res_generator> handle_completions_impl(
             const server_http_req & req,

@@ -44,6 +44,7 @@ extern char **environ;
 #define CMD_ROUTER_TO_CHILD_EXIT  "cmd_router_to_child:exit"
 #define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep
 #define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep"
+#define CMD_CHILD_TO_ROUTER_INFO  "cmd_child_to_router:info:" // followed by json string
 
 // address for child process, this is needed because router may run on 0.0.0.0
 // ref: https://github.com/ggml-org/llama.cpp/issues/17862
@@ -718,10 +719,11 @@ void server_models::load(const std::string & name) {
 
     // prepare new instance info
     instance_t inst;
-    inst.meta           = meta;
-    inst.meta.port      = get_free_port();
-    inst.meta.status    = SERVER_MODEL_STATUS_LOADING;
-    inst.meta.last_used = ggml_time_ms();
+    inst.meta             = meta;
+    inst.meta.port        = get_free_port();
+    inst.meta.status      = SERVER_MODEL_STATUS_LOADING;
+    inst.meta.loaded_info = json{};
+    inst.meta.last_used   = ggml_time_ms();
 
     if (inst.meta.port <= 0) {
         throw std::runtime_error("failed to get a port number");
@@ -767,12 +769,14 @@ void server_models::load(const std::string & name) {
             // read stdout/stderr and forward to main server log
             // also handle status report from child process
             if (stdout_file) {
-                char buffer[4096];
+                char buffer[128 * 1024]; // large buffer for storing info
                 while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
                     LOG("[%5d] %s", port, buffer);
                     std::string str(buffer);
                     if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
                         this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
+                    } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) {
+                        this->update_loaded_info(name, str);
                     } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
                         this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
                     }
@@ -916,6 +920,29 @@ void server_models::update_status(const std::string & name, server_model_status
     cv.notify_all();
 }
 
+// Record the info that a child instance reported for the model `name`.
+//
+// raw_info is the full line read from the child: CMD_CHILD_TO_ROUTER_INFO immediately
+// followed by a JSON string. The payload must be a JSON object; a line with a wrong
+// prefix, unparsable JSON or a non-object value is logged and ignored, leaving the
+// previously stored loaded_info untouched.
+//
+// Takes `mutex` while updating and notifies every waiter on `cv` afterwards, even when
+// `name` is not present in the mapping.
+void server_models::update_loaded_info(const std::string & name, std::string & raw_info) {
+    if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) {
+        SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str());
+        return;
+    }
+
+    json info;
+    try {
+        info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO)));
+    } catch (const std::exception & e) {
+        SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what());
+        return;
+    }
+
+    // the router's /v1/models handler merges loaded_info key by key; calling key() on an
+    // iterator of a non-object value throws there, so reject such payloads up front
+    if (!info.is_object()) {
+        SRV_WRN("ignoring loaded info from child for model name=%s: expected a JSON object\n", name.c_str());
+        return;
+    }
+
+    std::unique_lock<std::mutex> lk(mutex);
+    auto it = mapping.find(name);
+    if (it != mapping.end()) {
+        it->second.meta.loaded_info = info;
+    }
+    cv.notify_all();
+}
+
 void server_models::wait_until_loading_finished(const std::string & name) {
     std::unique_lock<std::mutex> lk(mutex);
     cv.wait(lk, [this, &name]() {
@@ -994,12 +1021,14 @@ bool server_models::is_child_server() {
     return router_port != nullptr;
 }
 
-std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler) {
+std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info) {
     // send a notification to the router server that a model instance is ready
     common_log_pause(common_log_main());
     fflush(stdout);
     fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
     fflush(stdout);
+    fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str());
+    fflush(stdout);
     common_log_resume(common_log_main());
 
     // setup thread for monitoring stdin
@@ -1176,7 +1205,8 @@ void server_models_routes::init_routes() {
                 status["exit_code"] = meta.exit_code;
                 status["failed"]    = true;
             }
-            models_json.push_back(json {
+
+            json model_info = json {
                 {"id",       meta.name},
                 {"aliases",  meta.aliases},
                 {"tags",     meta.tags},
@@ -1185,7 +1215,17 @@ void server_models_routes::init_routes() {
                 {"created",  t},          // for OAI-compat
                 {"status",   status},
                 // TODO: add other fields, may require reading GGUF metadata
-            });
+            };
+
+            // merge with loaded_info from the child process if available
+            if (meta.is_running()) {
+                for (auto it = meta.loaded_info.begin(); it != meta.loaded_info.end(); ++it) {
+                    if (!model_info.contains(it.key())) {
+                        model_info[it.key()] = it.value();
+                    }
+                }
+            }
+            models_json.push_back(model_info);
         }
         res_ok(res, {
             {"data", models_json},

@@ -63,6 +63,7 @@ struct server_model_meta {
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
+    json loaded_info; // info to be reflected via /v1/models endpoint
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
 
@@ -145,6 +146,7 @@ struct server_models {
 
     // update the status of a model instance (thread-safe)
     void update_status(const std::string & name, server_model_status status, int exit_code);
+    void update_loaded_info(const std::string & name, std::string & raw_info);
 
     // wait until the model instance is fully loaded (thread-safe)
     // return when the model no longer in "loading" state
@@ -163,7 +165,7 @@ struct server_models {
 
     // notify the router server that a model instance is ready
     // return the monitoring thread (to be joined by the caller)
-    static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler);
+    static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info);
 
     // notify the router server that the sleeping state has changed
     static void notify_router_sleeping_state(bool sleeping);

@@ -329,7 +329,8 @@ int main(int argc, char ** argv) {
         // optionally, notify router server that this instance is ready
         std::thread monitor_thread;
         if (server_models::is_child_server()) {
-            monitor_thread = server_models::setup_child_server(shutdown_handler);
+            json model_info = routes.get_model_info();
+            monitor_thread = server_models::setup_child_server(shutdown_handler, model_info);
         }
 
         // this call blocks the main thread until queue_tasks.terminate() is called