From 034403de7147702223dff9039c586dead0c87f22 Mon Sep 17 00:00:00 2001
From: Justin Parker
Date: Mon, 5 Feb 2024 13:42:23 -0500
Subject: [PATCH 1/5] include total "num_slots" in default_generation_settings_for_props

---
 examples/server/server.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8000fee5c90d7..9fb00ec68f3a1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -432,6 +432,7 @@ struct llama_server_context
         }
 
         default_generation_settings_for_props = get_formated_generation(slots.front());
+        default_generation_settings_for_props["num_slots"] = params.n_parallel;
         default_generation_settings_for_props["seed"] = -1;
 
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);

From 4ff37ea41f3a68bf92b8a49caa7dae36c93ab16b Mon Sep 17 00:00:00 2001
From: Justin Parker
Date: Tue, 6 Feb 2024 14:53:02 -0500
Subject: [PATCH 2/5] cleanup total_slots return value in /props endpoint

---
 examples/server/server.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9fb00ec68f3a1..f5e0522e9d526 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -432,7 +432,6 @@ struct llama_server_context
         }
 
         default_generation_settings_for_props = get_formated_generation(slots.front());
-        default_generation_settings_for_props["num_slots"] = params.n_parallel;
         default_generation_settings_for_props["seed"] = -1;
 
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
@@ -2620,7 +2619,8 @@ int main(int argc, char **argv)
                 json data = {
                     { "user_name",      llama.name_user.c_str() },
                     { "assistant_name", llama.name_assistant.c_str() },
-                    { "default_generation_settings", llama.default_generation_settings_for_props }
+                    { "default_generation_settings", llama.default_generation_settings_for_props },
+                    { "total_slots", llama.params.n_parallel }
                 };
                 res.set_content(data.dump(), "application/json; charset=utf-8");
             });

From 6ada782e804aeda0db5e7efe5803780c701983f1 Mon Sep 17 00:00:00 2001
From: Justin Parker
Date: Tue, 6 Feb 2024 14:53:33 -0500
Subject: [PATCH 3/5] update /props endpoint docs with total_slots

---
 examples/server/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index d8e7c313e1732..038f6641b3416 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -272,13 +272,15 @@ Notice that each `probs` is an array of length `n_probs`.
 {
   "assistant_name": "",
   "user_name": "",
-  "default_generation_settings": { ... }
+  "default_generation_settings": { ... },
+  "total_slots": ""
 }
 ```
 
 - `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
 - `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint.
+- `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option)
 
 - **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine.
 While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.

From 2ea831b21220f7ae4688d46e68b1762e49c4f9f1 Mon Sep 17 00:00:00 2001
From: Justin Parker
Date: Tue, 6 Feb 2024 14:56:42 -0500
Subject: [PATCH 4/5] remove num_slots from default_generation_settings_for_props

---
 examples/server/server.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 41c8a74e78c3c..eceda30d05fcc 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -432,7 +432,6 @@ struct llama_server_context
         }
 
         default_generation_settings_for_props = get_formated_generation(slots.front());
-        default_generation_settings_for_props["num_slots"] = params.n_parallel;
         default_generation_settings_for_props["seed"] = -1;
 
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);

From 9db65d2d0eeab7df6efa20578bc2d7345a9c86f0 Mon Sep 17 00:00:00 2001
From: Justin Parker
Date: Tue, 6 Feb 2024 16:58:44 -0500
Subject: [PATCH 5/5] update /props endpoint section

---
 examples/server/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 1aa860ff75ef0..1db7cdf2191a7 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -277,7 +277,7 @@ Notice that each `probs` is an array of length `n_probs`.
 {
   "assistant_name": "",
   "user_name": "",
   "default_generation_settings": { ... },
-  "total_slots": ""
+  "total_slots": 1
 }
 ```
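Not part of the patch series, just an illustrative, untested sketch of how a client might consume the new top-level `total_slots` property. It assumes the single-header `httplib.h` and `json.hpp` libraries that the server example already bundles, and a server listening on the default `localhost:8080`; the host, port, and error handling here are placeholders, not part of the project's API.

```cpp
// Sketch only: query /props and read "total_slots" (mirrors --parallel).
#include <cstdio>

#include "httplib.h"   // assumed: the single-header HTTP client bundled with the server example
#include "json.hpp"    // assumed: nlohmann/json, also bundled with the server example

using json = nlohmann::json;

int main() {
    httplib::Client cli("localhost", 8080); // assumed default server address

    auto res = cli.Get("/props");
    if (!res || res->status != 200) {
        fprintf(stderr, "request to /props failed\n");
        return 1;
    }

    const json props = json::parse(res->body);

    // fall back to 1 (the server default) if the field is absent
    const int total_slots = props.value("total_slots", 1);
    printf("server reports %d slot(s)\n", total_slots);

    return 0;
}
```

The patches expose the slot count as a top-level `total_slots` field of `/props` (taken from `params.n_parallel`) rather than as a `num_slots` entry inside `default_generation_settings`, so clients can read it without inspecting the per-slot generation defaults.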