diff --git a/common/arg.cpp b/common/arg.cpp index e0f6c606608..91ce3a95f5b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3127,6 +3127,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.models_max = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); + add_opt(common_arg( + {"--models-memory-margin"}, "N", + string_format("for router server, MiB of memory to leave free, per device (default: %d, 0 = unlimited)", params.models_memory_margin), + [](common_params & params, int value) { + params.models_memory_margin = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MARGIN")); add_opt(common_arg( {"--models-autoload"}, {"--no-models-autoload"}, @@ -3356,6 +3363,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.offline = true; } ).set_env("LLAMA_ARG_OFFLINE")); + add_opt(common_arg( + {"--download-only"}, + "Download the model file(s) and exit", + [](common_params & params) { + params.download_only = true; + } + )); + add_opt(common_arg( + {"--measure-only"}, + "Load the model to measure memory requirements, print to stdout, then exit", + [](common_params & params) { + params.measure_only = true; + } + )); add_opt(common_arg( {"-lv", "--verbosity", "--log-verbosity"}, "N", string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n" diff --git a/common/common.h b/common/common.h index 99898800d1d..34dcc9dd2dc 100644 --- a/common/common.h +++ b/common/common.h @@ -508,6 +508,8 @@ struct common_params { int32_t control_vector_layer_end = -1; // layer range for control vector bool offline = false; bool skip_download = false; // skip model file downloading + bool download_only = false; // only download the model if required, don't start the server + bool measure_only = false; // load model with no_alloc to measure memory, print to stdout, then exit int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line @@ -637,10 +639,11 @@ struct common_params { std::vector server_tools; // router server configs - std::string models_dir = ""; // directory containing models for the router server - std::string models_preset = ""; // directory containing model presets for the router server - int models_max = 4; // maximum number of models to load simultaneously - bool models_autoload = true; // automatically load models when requested via the router server + std::string models_dir = ""; // directory containing models for the router server + std::string models_preset = ""; // directory containing model presets for the router server + int models_max = 4; // maximum number of models to load simultaneously + int models_memory_margin = 1024; // MiB of free memory to preserve per device (0 = disabled) + bool models_autoload = true; // automatically load models when requested via the router server bool log_json = false; diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 49b0e423f46..ea6dd2d4014 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -98,6 +98,7 @@ static void unset_reserved_args(common_preset & preset, bool unset_model_args) { preset.unset_option("LLAMA_API_KEY"); preset.unset_option("LLAMA_ARG_MODELS_DIR"); preset.unset_option("LLAMA_ARG_MODELS_MAX"); + preset.unset_option("LLAMA_ARG_MODELS_MEMORY_MARGIN"); preset.unset_option("LLAMA_ARG_MODELS_PRESET"); preset.unset_option("LLAMA_ARG_MODELS_AUTOLOAD"); if (unset_model_args) { @@ -213,9 +214,39 @@ server_models::server_models( bin_path = get_server_exec_path().string(); } catch (const std::exception & e) { bin_path = argv[0]; - LOG_WRN("failed to get server executable path: %s\n", e.what()); - LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]); + SRV_WRN("failed to get server executable path: %s\n", e.what()); + SRV_WRN("using original argv[0] as fallback: %s\n", argv[0]); } + + const size_t memory_margin = (size_t) base_params.models_memory_margin * 1024 * 1024; + + if (memory_margin > 0) { + ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + ggml_backend_buffer_type_t cpu_buft = cpu_dev ? ggml_backend_dev_buffer_type(cpu_dev) : nullptr; + + const size_t n_devs = ggml_backend_dev_count(); + for (size_t i = 0; i < n_devs; i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + ggml_backend_buffer_type_t dev_buft = ggml_backend_dev_buffer_type(dev); + if (dev_buft) { + buft_by_name[ggml_backend_buft_name(dev_buft)] = dev_buft; + } + ggml_backend_buffer_type_t host_buft = ggml_backend_dev_host_buffer_type(dev); + if (host_buft && cpu_buft) { + buft_by_name[ggml_backend_buft_name(host_buft)] = cpu_buft; + } + + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + if (total > 0 && dev_buft) { + const size_t available = (free > memory_margin) ? free - memory_margin : 0; + bmm_available[dev_buft] = available; + SRV_DBG("buft %s: available memory after margin=%zu MiB\n", + ggml_backend_buft_name(dev_buft), available / (1024 * 1024)); + } + } + } + load_models(); } @@ -379,6 +410,7 @@ void server_models::load_models() { /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, /* last_used */ 0, + /* bmm_req */ {}, /* args */ std::vector(), /* loaded_info */ {}, /* exit_code */ 0, @@ -533,6 +565,7 @@ void server_models::load_models() { /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, /* last_used */ 0, + /* bmm_req */ {}, /* args */ std::vector(), /* loaded_info */ {}, /* exit_code */ 0, @@ -692,30 +725,87 @@ std::vector server_models::get_all_meta() { return result; } -void server_models::unload_lru() { - if (base_params.models_max <= 0) { - return; // no limit +int server_models::can_fit(const buft_memory_map & bmm_req) const { + buft_memory_map bmm_total; + for (const auto & m : mapping) { + if (m.second.meta.is_running()) { + for (const auto & [buft, mem] : m.second.meta.bmm_req) { + bmm_total[buft] += mem; + } + } } - // remove one of the servers if we passed the models_max (least recently used - LRU) - std::string lru_model_name = ""; - int64_t lru_last_used = ggml_time_ms(); - size_t count_active = 0; - { - std::unique_lock lk(mutex); - for (const auto & m : mapping) { - if (m.second.meta.is_running()) { - count_active++; - if (m.second.meta.last_used < lru_last_used) { + + auto get = [](const buft_memory_map & dmm, ggml_backend_buffer_type_t buft) -> size_t { + auto it = dmm.find(buft); + return it != dmm.end() ? it->second : 0; + }; + + int res = 0; + + for (const auto & [buft, limit] : bmm_available) { + const size_t mem_total = get(bmm_total, buft); + const size_t mem_new = get(bmm_req, buft); + + SRV_DBG("buft %s: total=%zu MiB, new=%zu MiB, limit=%zu MiB\n", + ggml_backend_buft_name(buft), + mem_total / (1024 * 1024), mem_new / (1024 * 1024), limit / (1024 * 1024)); + + if (mem_total + mem_new > limit) { + res++; + } + } + + return res; +} + +bool server_models::limits_exceeded(const buft_memory_map & bmm_req) const { + const bool check_active = base_params.models_max > 0; + const bool check_memory = base_params.models_memory_margin > 0; + + if (!check_active && !check_memory) { + return false; + } + + int count_active = 0; + for (const auto & m : mapping) { + if (m.second.meta.is_running()) { + count_active++; + } + } + + const bool active_exceeded = check_active && count_active >= base_params.models_max; + const bool memory_exceeded = check_memory && can_fit(bmm_req) > 0; + + return active_exceeded || memory_exceeded; +} + +void server_models::unload_lru(const buft_memory_map & bmm_req) { + if (base_params.models_memory_margin > 0) { + GGML_ASSERT(!bmm_available.empty()); + } + + while (true) { + std::string lru_model_name; + { + std::unique_lock lk(mutex); + if (!limits_exceeded(bmm_req)) { + break; + } + int64_t lru_last_used = ggml_time_ms(); + for (const auto & m : mapping) { + if (m.second.meta.is_running() && m.second.meta.last_used < lru_last_used) { lru_model_name = m.first; - lru_last_used = m.second.meta.last_used; + lru_last_used = m.second.meta.last_used; } } } - } - if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) { - SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str()); + + if (lru_model_name.empty()) { + break; + } + + SRV_INF("limits exceeded, removing LRU name=%s\n", lru_model_name.c_str()); unload(lru_model_name); - // wait for unload to complete { std::unique_lock lk(mutex); cv.wait(lk, [this, &lru_model_name]() { @@ -725,11 +815,238 @@ void server_models::unload_lru() { } } +static std::string resolve_model_path(const common_preset & preset) { + common_params params; + preset.apply_to_params(params); + + if (!params.model.path.empty()) { + return params.model.path; + } + + if (!params.model.hf_repo.empty() || !params.model.url.empty()) { + common_download_opts opts; + opts.offline = true; + auto result = common_download_model(params.model, opts); + return result.model_path; + } + + return ""; +} + +bool server_models::download_model(const std::string & name) { + std::vector child_args; + std::vector child_env; + { + std::lock_guard lk(mutex); + auto & meta = mapping[name].meta; + child_args = meta.preset.to_args(bin_path); + child_env = base_env; + } + child_args.push_back("--download-only"); + + SRV_INF("downloading model name=%s\n", name.c_str()); + + std::vector argv = to_char_ptr_array(child_args); + std::vector envp = to_char_ptr_array(child_env); + + subprocess_s proc; + int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr; + if (subprocess_create_ex(argv.data(), options, envp.data(), &proc) != 0) { + SRV_ERR("failed to spawn download process for model name=%s\n", name.c_str()); + return false; + } + + FILE * out = subprocess_stdout(&proc); + if (out) { + char buffer[4096]; + while (fgets(buffer, sizeof(buffer), out) != nullptr) { + LOG("[dl:%s] %s", name.c_str(), buffer); + } + } + + int exit_code = 0; + subprocess_join(&proc, &exit_code); + subprocess_destroy(&proc); + + if (exit_code != 0) { + SRV_ERR("download process for model name=%s exited with code %d\n", name.c_str(), exit_code); + return false; + } + + SRV_INF("download complete for model name=%s\n", name.c_str()); + return true; +} + +buft_memory_map server_models::estimate_model_memory(const std::string & name) { + std::vector child_args; + std::vector child_env; + { + std::lock_guard lk(mutex); + auto & meta = mapping[name].meta; + child_args = meta.preset.to_args(bin_path); + child_env = base_env; + } + child_args.push_back("--measure-only"); + child_args.push_back("--offline"); + + SRV_INF("estimating memory for model name=%s\n", name.c_str()); + + std::vector argv = to_char_ptr_array(child_args); + std::vector envp = to_char_ptr_array(child_env); + + subprocess_s proc; + int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr; + if (subprocess_create_ex(argv.data(), options, envp.data(), &proc) != 0) { + SRV_ERR("failed to spawn measure process for model name=%s\n", name.c_str()); + return {}; + } + + buft_memory_map result; + FILE * out = subprocess_stdout(&proc); + if (out) { + char buffer[4096]; + while (fgets(buffer, sizeof(buffer), out) != nullptr) { + LOG("[measure:%s] %s", name.c_str(), buffer); + std::string line(buffer); + if (string_starts_with(line, "measure:")) { + std::istringstream iss(line.substr(strlen("measure:"))); + std::string buft_name; + size_t size = 0; + if (iss >> buft_name >> size) { + auto it = buft_by_name.find(buft_name); + if (it != buft_by_name.end()) { + result[it->second] += size; + } else { + SRV_WRN("unknown buft name '%s' from measure child for model name=%s\n", + buft_name.c_str(), name.c_str()); + } + } + } + } + } + + int exit_code = 0; + subprocess_join(&proc, &exit_code); + subprocess_destroy(&proc); + + if (exit_code != 0) { + SRV_ERR("measure process for model name=%s exited with code %d\n", name.c_str(), exit_code); + return {}; + } + + SRV_INF("memory estimation complete for model name=%s\n", name.c_str()); + return result; +} + +void server_models::join_completed_bg_tasks() { + std::vector> to_join; + { + std::lock_guard lk(mutex); + for (auto it = bg_tasks.begin(); it != bg_tasks.end(); ) { + if (it->second->done.load()) { + to_join.push_back(std::move(it->second)); + it = bg_tasks.erase(it); + } else { + ++it; + } + } + } + for (auto & task : to_join) { + if (task->th.joinable()) { + task->th.join(); + } + } +} + void server_models::load(const std::string & name) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); } - unload_lru(); + + join_completed_bg_tasks(); + + { + common_preset preset_copy; + { + std::lock_guard lk(mutex); + preset_copy = mapping[name].meta.preset; + } + if (resolve_model_path(preset_copy).empty()) { + { + std::lock_guard lk(mutex); + auto & meta = mapping[name].meta; + if (meta.status != SERVER_MODEL_STATUS_UNLOADED) { + return; + } + // skip if a bg_task is already running for this model + auto bg_it = bg_tasks.find(name); + if (bg_it != bg_tasks.end() && !bg_it->second->done.load()) { + return; + } + meta.status = SERVER_MODEL_STATUS_DOWNLOADING; + cv.notify_all(); + } + + auto task = std::make_unique(); + auto * task_ptr = task.get(); + + task->th = std::thread([this, name, task_ptr]() { + if (!download_model(name)) { + update_status(name, SERVER_MODEL_STATUS_UNLOADED, 1); + task_ptr->done.store(true); + return; + } + buft_memory_map mem; + if (base_params.models_memory_margin > 0) { + mem = estimate_model_memory(name); + if (mem.empty()) { + SRV_WRN("failed to estimate memory for model %s, memory limits will not apply\n", name.c_str()); + } + { + std::lock_guard lk(mutex); + mapping[name].meta.bmm_req = mem; + } + } + try { + _load(name, mem); + } catch (const std::exception & e) { + SRV_ERR("failed to load model %s after download: %s\n", name.c_str(), e.what()); + update_status(name, SERVER_MODEL_STATUS_UNLOADED, 1); + } + task_ptr->done.store(true); + }); + + { + std::lock_guard lk(mutex); + bg_tasks[name] = std::move(task); + } + return; + } + } + + buft_memory_map bmm_req; + if (base_params.models_memory_margin > 0) { + { + std::lock_guard lk(mutex); + bmm_req = mapping[name].meta.bmm_req; + } + if (bmm_req.empty()) { + bmm_req = estimate_model_memory(name); + if (bmm_req.empty()) { + SRV_WRN("failed to estimate memory for model %s, memory limits will not apply\n", name.c_str()); + } + { + std::lock_guard lk(mutex); + mapping[name].meta.bmm_req = bmm_req; + } + } + } + + _load(name, bmm_req); +} + +void server_models::_load(const std::string & name, const buft_memory_map & bmm_req) { + unload_lru(bmm_req); std::unique_lock lk(mutex); // edge case: block until any in-progress reload has finished so we always load @@ -737,7 +1054,7 @@ void server_models::load(const std::string & name) { cv.wait(lk, [this]() { return !is_reloading; }); auto meta = mapping[name].meta; - if (meta.status != SERVER_MODEL_STATUS_UNLOADED) { + if (meta.status != SERVER_MODEL_STATUS_UNLOADED && meta.status != SERVER_MODEL_STATUS_DOWNLOADING) { SRV_INF("model %s is not ready\n", name.c_str()); return; } @@ -746,16 +1063,8 @@ void server_models::load(const std::string & name) { // exceeding models_max. Without this, the window between unload_lru() // releasing its lock and this lock_guard acquiring allows multiple // threads to each observe capacity and all proceed to load. - if (base_params.models_max > 0) { - size_t count_active = 0; - for (const auto & m : mapping) { - if (m.second.meta.is_running()) { - count_active++; - } - } - if (count_active >= (size_t)base_params.models_max) { - throw std::runtime_error("model limit reached, try again later"); - } + if (limits_exceeded(bmm_req)) { + throw std::runtime_error("model limit reached, try again later"); } // prepare new instance info @@ -931,6 +1240,7 @@ void server_models::unload(const std::string & name) { void server_models::unload_all() { std::vector to_join; + std::vector> bg_to_join; { std::lock_guard lk(mutex); for (auto & [name, inst] : mapping) { @@ -943,15 +1253,26 @@ void server_models::unload_all() { // moving the thread to join list to avoid deadlock to_join.push_back(std::move(inst.th)); } + for (auto & [name, task] : bg_tasks) { + bg_to_join.push_back(std::move(task)); + } + bg_tasks.clear(); } for (auto & th : to_join) { if (th.joinable()) { th.join(); } } + for (auto & task : bg_to_join) { + if (task && task->th.joinable()) { + task->th.join(); + } + } } void server_models::update_status(const std::string & name, server_model_status status, int exit_code) { + join_completed_bg_tasks(); + std::unique_lock lk(mutex); auto it = mapping.find(name); if (it != mapping.end()) { @@ -990,7 +1311,8 @@ void server_models::wait_until_loading_finished(const std::string & name) { cv.wait(lk, [this, &name]() { auto it = mapping.find(name); if (it != mapping.end()) { - return it->second.meta.status != SERVER_MODEL_STATUS_LOADING; + return it->second.meta.status != SERVER_MODEL_STATUS_LOADING && + it->second.meta.status != SERVER_MODEL_STATUS_DOWNLOADING; } return false; }); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 2198589a7aa..69bbf87bdc2 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -5,15 +5,20 @@ #include "server-common.h" #include "server-http.h" +#include #include #include #include #include #include +#include /** * state diagram: * + * + * ┌► DOWNLOADING ─┐ + * │ ▼ * UNLOADED ──► LOADING ──► LOADED ◄──── SLEEPING * ▲ │ │ ▲ * └───failed───┘ │ │ @@ -21,8 +26,8 @@ * └────────unloaded─────────┘ */ enum server_model_status { - // TODO: also add downloading state when the logic is added SERVER_MODEL_STATUS_UNLOADED, + SERVER_MODEL_STATUS_DOWNLOADING, SERVER_MODEL_STATUS_LOADING, SERVER_MODEL_STATUS_LOADED, SERVER_MODEL_STATUS_SLEEPING @@ -32,6 +37,9 @@ static server_model_status server_model_status_from_string(const std::string & s if (status_str == "unloaded") { return SERVER_MODEL_STATUS_UNLOADED; } + if (status_str == "downloading") { + return SERVER_MODEL_STATUS_DOWNLOADING; + } if (status_str == "loading") { return SERVER_MODEL_STATUS_LOADING; } @@ -46,14 +54,17 @@ static server_model_status server_model_status_from_string(const std::string & s static std::string server_model_status_to_string(server_model_status status) { switch (status) { - case SERVER_MODEL_STATUS_UNLOADED: return "unloaded"; - case SERVER_MODEL_STATUS_LOADING: return "loading"; - case SERVER_MODEL_STATUS_LOADED: return "loaded"; - case SERVER_MODEL_STATUS_SLEEPING: return "sleeping"; - default: return "unknown"; + case SERVER_MODEL_STATUS_UNLOADED: return "unloaded"; + case SERVER_MODEL_STATUS_DOWNLOADING: return "downloading"; + case SERVER_MODEL_STATUS_LOADING: return "loading"; + case SERVER_MODEL_STATUS_LOADED: return "loaded"; + case SERVER_MODEL_STATUS_SLEEPING: return "sleeping"; + default: return "unknown"; } } +using buft_memory_map = std::map; + struct server_model_meta { common_preset preset; std::string name; @@ -62,6 +73,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading + buft_memory_map bmm_req; // bytes required per buffer type std::vector args; // args passed to the model instance, will be populated by render_args() json loaded_info; // info to be reflected via /v1/models endpoint int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) @@ -104,6 +116,13 @@ struct server_models { std::condition_variable cv_stop; std::set stopping_models; + // background tasks for download/estimate/load pipelines, keyed by model name + struct bg_task { + std::thread th; + std::atomic done{false}; + }; + std::map> bg_tasks; + // set to true while load_models() is executing a reload; load() will wait until clear bool is_reloading = false; @@ -114,14 +133,42 @@ struct server_models { std::vector base_env; common_preset base_preset; // base preset from llama-server CLI args + // available memory per buffer type + buft_memory_map bmm_available; + + // buft name -> buft lookup (host buffer types map to CPU buft) + std::unordered_map buft_by_name; + void update_meta(const std::string & name, const server_model_meta & meta); // unload least recently used models if the limit is reached - void unload_lru(); + void unload_lru(const buft_memory_map & bmm_req); // not thread-safe, caller must hold mutex void add_model(server_model_meta && meta); + // return number of buffer types where the memory limit would be exceeded + // return 0 if the new model would fit + // not thread-safe, caller must hold mutex + int can_fit(const buft_memory_map & bmm_req) const; + + // check if active model count or memory limits would be exceeded + // not thread-safe, caller must hold mutex + bool limits_exceeded(const buft_memory_map & bmm_req) const; + + // download model files, blocking call (caller must NOT hold mutex) + bool download_model(const std::string & name); + + // estimate model memory by spawning a child process with --measure-only + // returns the buft memory map, or empty map on failure (caller must NOT hold mutex) + buft_memory_map estimate_model_memory(const std::string & name); + + // join and remove completed background tasks + void join_completed_bg_tasks(); + + // Internal helper for model loading + void _load(const std::string & name, const buft_memory_map & bmm_req); + public: server_models(const common_params & params, int argc, char ** argv); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 4d56d45e83c..eae50776fb1 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -11,6 +11,8 @@ #include "llama.h" #include "log.h" +#include "../../src/llama-ext.h" + #include #include #include @@ -94,6 +96,11 @@ int llama_server(int argc, char ** argv) { const bool is_router_server = params.model.path.empty(); common_params_print_info(params, !is_router_server); + if (params.download_only) { + LOG_INF("%s: model downloaded successfully, exiting\n", __func__); + return 0; + } + // validate batch size for embeddings // embeddings require all tokens to be processed in a single ubatch // see https://github.com/ggml-org/llama.cpp/issues/12836 @@ -118,6 +125,44 @@ int llama_server(int argc, char ** argv) { // struct that contains llama context and inference server_context ctx_server; + if (params.measure_only) { + llama_model_params mparams = common_model_params_to_llama(params); + mparams.no_alloc = true; + mparams.use_mmap = false; + mparams.use_mlock = false; + + llama_model_ptr model{llama_model_load_from_file(params.model.path.c_str(), mparams)}; + if (!model) { + LOG_ERR("%s: failed to load model for measurement\n", __func__); + llama_backend_free(); + return 1; + } + + llama_context_params cparams = common_context_params_to_llama(params); + llama_context_ptr ctx{llama_init_from_model(model.get(), cparams)}; + if (!ctx) { + LOG_ERR("%s: failed to create context for measurement\n", __func__); + llama_backend_free(); + return 1; + } + + common_log_pause(common_log_main()); + for (const auto & [buft, data] : llama_get_memory_breakdown(ctx.get())) { + size_t total = data.total(); + if (total > 0) { + fprintf(stdout, "measure:%s %zu\n", ggml_backend_buft_name(buft), total); + } + } + fflush(stdout); + common_log_resume(common_log_main()); + + llama_backend_free(); + return 0; + } + + LOG_INF("build_info: %s\n", llama_build_info()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + server_http_context ctx_http; if (!ctx_http.init(params)) { SRV_ERR("%s", "failed to initialize HTTP server\n");