Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.

**Features:**
* LLM inference of F16 and quantized models on GPU and CPU
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions, responses, and embeddings routes
* Parallel decoding with multi-user support
* Continuous batching
* Multimodal (wip)
Expand Down Expand Up @@ -706,6 +706,48 @@ curl http://localhost:8080/v1/chat/completions \

**See our [Function calling](../../docs/function-calling.md) docs** for more details, supported native tool call styles (generic tool call style is used as fallback) / examples of use.

### POST `/v1/responses`: OpenAI-compatible Responses API

*Options:*

See [OpenAI Responses API documentation](https://platform.openai.com/docs/api-reference/responses).

*Examples:*

You can use either the Python `openai` library with appropriate checkpoints:

```python
import openai

client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key = "sk-no-key-required"
)

response = client.responses.create(
model="gpt-4.1",
instructions="You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
input="Write a limerick about python exceptions"
)

print(response.output_text)
```

... or raw HTTP requests:

```shell
curl http://localhost:8080/v1/responses \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"model": "gpt-4.1",
"instructions": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
"input": "Write a limerick about python exceptions"
}'
```

This endpoint works by converting Responses requests into Chat Completions requests.

### POST `/v1/embeddings`: OpenAI-compatible embeddings API

*Options:*
Expand Down
270 changes: 270 additions & 0 deletions examples/server/server-common.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "server-common.h"

#include <algorithm>

using raw_buffer = std::vector<uint8_t>;


Expand Down Expand Up @@ -505,6 +507,30 @@ bool server_sent_event(httplib::DataSink& sink, const json& data) {
return true;
}

// Stream OpenAI Responses API server-sent events over the given sink.
// A JSON array is treated as a batch of events, each written as its own
// SSE frame in order; a single object is written as one frame.
// Returns false as soon as a write fails.
bool server_sent_oai_resp_event(httplib::DataSink& sink, const json& data) {
    static auto emit = [](httplib::DataSink& s, const json& ev) -> bool {
        const std::string frame =
            "event: " + ev.at("event").get<std::string>() + "\n" +
            "data: " + ev.at("data").dump(-1, ' ', false, json::error_handler_t::replace) + "\n\n";

        LOG_DBG("data stream, to_send: %s", frame.c_str());
        return s.write(frame.c_str(), frame.size());
    };

    if (!data.is_array()) {
        return emit(sink, data);
    }

    for (const auto& ev : data) {
        if (!emit(sink, ev)) {
            return false;
        }
    }
    return true;
}

bool server_sent_anthropic_event(httplib::DataSink& sink, const json& data) {
static auto send_single = [](httplib::DataSink& sink, const json& data) -> bool {
const std::string str =
Expand Down Expand Up @@ -874,6 +900,250 @@ json oaicompat_chat_params_parse(
return llama_params;
}

// Convert an OpenAI Responses API request body into an equivalent OpenAI
// Chat Completions request body.
//
// The Responses "input" (a plain string or an array of typed items) is
// flattened into a Chat Completions "messages" array; "instructions"
// becomes a leading system message; "tools" and "max_output_tokens" are
// reshaped/renamed to their Chat Completions counterparts. All other
// top-level fields are passed through unchanged.
//
// Throws std::runtime_error when required fields are missing or an
// unsupported feature is requested.
json convert_responses_to_chatcmpl(const json& response_body) {
    if (!response_body.contains("input")) {
        throw std::runtime_error("'input' is required");
    }
    // Server-side conversation state is not implemented, so the client must
    // resend the full history in "input" on every request.
    if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
        throw std::runtime_error("ik_llama.cpp does not support 'previous_response_id'.");
    }

    const json input_value = response_body.at("input");
    // Start from a copy of the request so unrelated fields (model, stream,
    // sampling params, ...) carry over, then strip the Responses-only keys.
    json chatcmpl_body = response_body;
    chatcmpl_body.erase("input");
    std::vector<json> chatcmpl_messages;

    // "instructions" maps to a system message placed before all input items.
    if (response_body.contains("instructions")) {
        chatcmpl_messages.push_back({
            {"role", "system"},
            {"content", json_value(response_body, "instructions", std::string())},
        });
        chatcmpl_body.erase("instructions");
    }

    if (input_value.is_string()) {
        // Shorthand form: a bare string is a single user message.
        chatcmpl_messages.push_back({
            {"role", "user"},
            {"content", input_value},
        });
    }
    else if (input_value.is_array()) {
        static auto exists_and_is_array = [](const json& j, const char* key) -> bool {
            return j.contains(key) && j.at(key).is_array();
        };
        static auto exists_and_is_string = [](const json& j, const char* key) -> bool {
            return j.contains(key) && j.at(key).is_string();
        };

        // Iterate by value: each item is normalized and mutated in place
        // before being appended to chatcmpl_messages.
        for (json item : input_value) {
            // Normalize a string "content" into the array-of-parts form so
            // the branches below only need to handle one shape.
            if (exists_and_is_string(item, "content")) {
                item["content"] = json::array({
                    json{
                        {"text", item.at("content")},
                        {"type", "input_text"},
                    }
                });
            }

            // Case 1: user/system/developer input message — translate each
            // content part from Responses types to Chat Completions types.
            if (exists_and_is_array(item, "content") &&
                exists_and_is_string(item, "role") &&
                (item.at("role") == "user" || item.at("role") == "system" || item.at("role") == "developer")
            ) {
                std::vector<json> chatcmpl_content;

                for (const json& input_item : item.at("content")) {
                    const std::string type = json_value(input_item, "type", std::string());

                    if (type == "input_text") {
                        if (!input_item.contains("text")) {
                            throw std::runtime_error("'Input text' requires 'text'");
                        }
                        // "input_text" -> Chat Completions "text" part.
                        chatcmpl_content.push_back({
                            {"text", input_item.at("text")},
                            {"type", "text"},
                        });
                    }
                    else if (type == "input_image") {
                        if (!input_item.contains("image_url")) {
                            throw std::runtime_error("'image_url' is required");
                        }
                        // "input_image" -> Chat Completions "image_url" part.
                        chatcmpl_content.push_back({
                            {"image_url", json{
                                {"url", input_item.at("image_url")},
                            }},
                            {"type", "image_url"},
                        });
                    }
                    else if (type == "input_file") {
                        throw std::runtime_error("'input_file' is not supported by ik_llama.cpp at this moment");
                    }
                    else {
                        throw std::runtime_error("'type' must be one of 'input_text', 'input_image', or 'input_file'");
                    }
                }

                // Drop Responses-only bookkeeping fields, keep the rest of
                // the item (e.g. "role") intact.
                if (item.contains("type")) {
                    item.erase("type");
                }
                if (item.contains("status")) {
                    item.erase("status");
                }
                item["content"] = chatcmpl_content;

                chatcmpl_messages.push_back(item);
            }
            // Case 2: a previously generated assistant message being
            // replayed as history; content parts must all be "output_text".
            else if (exists_and_is_array(item, "content") &&
                exists_and_is_string(item, "role") &&
                item.at("role") == "assistant" &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "message"
            ) {
                std::vector<json> chatcmpl_content;

                for (const auto& output_text : item.at("content")) {
                    const std::string type = json_value(output_text, "type", std::string());
                    if (type != "output_text") {
                        throw std::runtime_error("'type' must be 'output_text'");
                    }
                    if (!exists_and_is_string(output_text, "text")) {
                        throw std::runtime_error("'Output text' requires 'text'");
                    }
                    chatcmpl_content.push_back({
                        {"text", output_text.at("text")},
                        {"type", "text"},
                    });
                }

                item.erase("status");
                item.erase("type");
                item["content"] = chatcmpl_content;
                chatcmpl_messages.push_back(item);
            }
            // Case 3: a tool call previously emitted by the assistant,
            // rebuilt as a Chat Completions "tool_calls" message.
            else if (exists_and_is_string(item, "arguments") &&
                exists_and_is_string(item, "call_id") &&
                exists_and_is_string(item, "name") &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "function_call"
            ) {
                json msg = json{
                    {"role", "assistant"},
                    {"tool_calls", json::array({json{
                        {"function", json{
                            {"arguments", item.at("arguments")},
                            {"name", item.at("name")},
                        }},
                        {"id", item.at("call_id")},
                        {"type", "function"},
                    }})},
                };

                // If the previous message is a reasoning placeholder
                // (Case 5), fold its reasoning_content into this tool-call
                // message, since Chat Completions carries reasoning on the
                // same assistant message as the call.
                if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) {
                    msg["reasoning_content"] = chatcmpl_messages.back().at("reasoning_content");
                    chatcmpl_messages.pop_back();
                }
                chatcmpl_messages.push_back(msg);
            }
            // Case 4: the output of a tool call, fed back as a "tool"
            // message keyed by tool_call_id.
            else if (exists_and_is_string(item, "call_id") &&
                (exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "function_call_output"
            ) {
                if (item.at("output").is_string()) {
                    chatcmpl_messages.push_back(json{
                        {"content", item.at("output")},
                        {"role", "tool"},
                        {"tool_call_id", item.at("call_id")},
                    });
                }
                else {
                    // Structured output: every part must be "input_text";
                    // retag each as a Chat Completions "text" part.
                    json chatcmpl_outputs = item.at("output");
                    for (json& chatcmpl_output : chatcmpl_outputs) {
                        if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") {
                            throw std::runtime_error("Output of tool call should be 'Input text'");
                        }
                        chatcmpl_output["type"] = "text";
                    }
                    chatcmpl_messages.push_back(json{
                        {"content", chatcmpl_outputs},
                        {"role", "tool"},
                        {"tool_call_id", item.at("call_id")},
                    });
                }
            }
            // Case 5: a reasoning item. Stored as a placeholder assistant
            // message (empty content + reasoning_content) so a following
            // function_call can absorb it; leftovers are removed below.
            else if (exists_and_is_array(item, "summary") &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "reasoning") {
                if (!exists_and_is_array(item, "content")) {
                    throw std::runtime_error("item['content'] is not an array");
                }
                if (item.at("content").empty()) {
                    throw std::runtime_error("item['content'] is empty");
                }
                // Only the first content entry's text is used.
                if (!exists_and_is_string(item.at("content")[0], "text")) {
                    throw std::runtime_error("item['content']['text'] is not a string");
                }

                chatcmpl_messages.push_back(json{
                    {"role", "assistant"},
                    {"content", json::array()},
                    {"reasoning_content", item.at("content")[0].at("text")},
                });
            }
            else {
                throw std::runtime_error("Cannot determine type of 'item'");
            }
        }
    }
    else {
        throw std::runtime_error("'input' must be a string or array of objects");
    }

    // Drop reasoning placeholders that were never absorbed by a tool call:
    // assistant messages with an empty content array and reasoning_content.
    chatcmpl_messages.erase(std::remove_if(
        chatcmpl_messages.begin(),
        chatcmpl_messages.end(),
        [](const json& x) {
            return x.contains("role") &&
                x.at("role") == "assistant" &&
                x.contains("content") &&
                x.at("content") == json::array() &&
                x.contains("reasoning_content");
        }),
        chatcmpl_messages.end());

    chatcmpl_body["messages"] = chatcmpl_messages;

    // Responses tool definitions are flat; Chat Completions nests them under
    // a "function" key. "strict" defaults to true when absent.
    if (response_body.contains("tools")) {
        if (!response_body.at("tools").is_array()) {
            throw std::runtime_error("'tools' must be an array of objects");
        }
        std::vector<json> chatcmpl_tools;
        for (json resp_tool : response_body.at("tools")) {
            json chatcmpl_tool;

            if (json_value(resp_tool, "type", std::string()) != "function") {
                throw std::runtime_error("'type' of tool must be 'function'");
            }
            resp_tool.erase("type");
            chatcmpl_tool["type"] = "function";

            if (!resp_tool.contains("strict")) {
                resp_tool["strict"] = true;
            }
            chatcmpl_tool["function"] = resp_tool;
            chatcmpl_tools.push_back(chatcmpl_tool);
        }
        chatcmpl_body.erase("tools");
        chatcmpl_body["tools"] = chatcmpl_tools;
    }

    // Rename max_output_tokens -> max_tokens.
    if (response_body.contains("max_output_tokens")) {
        chatcmpl_body.erase("max_output_tokens");
        chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
    }

    return chatcmpl_body;
}

json anthropic_params_from_json(
const struct llama_model* model,
const json& body_in, /* anthropic messages api json semantics */
Expand Down
6 changes: 5 additions & 1 deletion examples/server/server-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,8 @@ json probs_vector_to_json(const llama_context* ctx, const std::vector<completion

bool server_sent_event(httplib::DataSink& sink, const json& data);

bool server_sent_oai_resp_event(httplib::DataSink& sink, const json& data);

bool server_sent_anthropic_event(httplib::DataSink& sink, const json& data);

//
Expand All @@ -246,6 +248,9 @@ json oaicompat_chat_params_parse(
const oaicompat_parser_options& opt,
std::vector<raw_buffer>& out_files);

// convert OpenAI Responses API format to OpenAI Chat Completions API format
json convert_responses_to_chatcmpl(const json& body);

json anthropic_params_from_json(
const struct llama_model* model,
const json& body_in, /* anthropic messages api json semantics */
Expand Down Expand Up @@ -461,4 +466,3 @@ bool prompt_cache_equal(llama_context* ctx, const server_tokens& cache_tokens,
const server_tokens& prompt_tokens, size_t start, const common_prefix& prefix);

std::string safe_json_to_str(const json& data);

Loading