From c222838e2d3619be5aaacf450fa21f77ee31ad9b Mon Sep 17 00:00:00 2001 From: Pascal Date: Wed, 20 May 2026 23:21:27 +0200 Subject: [PATCH] server: re-inject subcommand when router spawns children under unified binary --- app/llama.cpp | 9 +++++++++ tools/server/server-models.cpp | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/app/llama.cpp b/app/llama.cpp index 10f909e2f3b..66257c56992 100644 --- a/app/llama.cpp +++ b/app/llama.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -58,6 +59,14 @@ int main(int argc, char ** argv) { for (const auto & cmd : cmds) { if (matches(arg, cmd)) { + + // router spawns children through this same binary, it needs the + // subcommand to relaunch as 'llama serve' and not bare options +#ifdef _WIN32 + _putenv_s("LLAMA_APP_CMD", cmd.name); +#else + setenv("LLAMA_APP_CMD", cmd.name, 1); +#endif return cmd.func(argc - 1, argv + 1); } } diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index ccf42320f77..47b6c2a4ec0 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -159,6 +160,13 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str // TODO: maybe validate preset before rendering ? // render args args = preset.to_args(bin_path); + + // unified binary dispatches by subcommand, re-inject it right after the + // binary path so the child starts as 'llama serve ...' not 'llama ...' + const char * app_cmd = std::getenv("LLAMA_APP_CMD"); + if (app_cmd != nullptr && app_cmd[0] != '\0' && !bin_path.empty()) { + args.insert(args.begin() + 1, app_cmd); + } } void server_model_meta::update_caps() {