Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements/requirements-tool_bench.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ pytest~=8.3.3
huggingface_hub>=0.34.0,<1.0
matplotlib~=3.10.0
numpy~=1.26.4
openai~=1.55.3
openai~=2.14.0
pandas~=2.2.3
prometheus-client~=0.20.0
requests~=2.32.3
Expand Down
45 changes: 44 additions & 1 deletion tools/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Set of LLM REST APIs and a web UI to interact with llama.cpp.

**Features:**
* LLM inference of F16 and quantized models on GPU and CPU
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions, responses, and embeddings routes
* [Anthropic Messages API](https://docs.anthropic.com/en/api/messages) compatible chat completions
* Reranking endpoint (https://github.com/ggml-org/llama.cpp/pull/9510)
* Parallel decoding with multi-user support
Expand Down Expand Up @@ -1267,6 +1267,49 @@ This provides information on the performance of the server. It also allows calcu

The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n`

### POST `/v1/responses`: OpenAI-compatible Responses API

*Options:*

See [OpenAI Responses API documentation](https://platform.openai.com/docs/api-reference/responses).

*Examples:*

You can use either the Python `openai` library with appropriate checkpoints:

```python
import openai

client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key = "sk-no-key-required"
)

response = client.responses.create(
model="gpt-4.1",
instructions="You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
input="Write a limerick about python exceptions"
)

print(response.output_text)
```

... or raw HTTP requests:

```shell
curl http://localhost:8080/v1/responses \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"model": "gpt-4.1",
"instructions": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
"input": "Write a limerick about python exceptions"
}'
```

This endpoint works by converting the Responses request into a Chat Completions request.


### POST `/v1/embeddings`: OpenAI-compatible embeddings API

This endpoint requires that the model uses a pooling type other than `none`. The embeddings are normalized using the Euclidean norm.
Expand Down
294 changes: 294 additions & 0 deletions tools/server/server-common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1069,6 +1069,282 @@ json oaicompat_chat_params_parse(
return llama_params;
}

// Convert an OpenAI Responses API request body into an OpenAI Chat Completions request body.
//
// - 'input' (required) is either a string (single user message) or an array of input items:
//   input/output messages, function tool calls, function tool call outputs and reasoning items
// - 'instructions' becomes a leading "system" message
// - 'tools' entries are wrapped as {"type": "function", "function": {...}}
// - 'max_output_tokens' is renamed to 'max_tokens'
// - every other field of the request is passed through unchanged
//
// Throws std::invalid_argument when the request is malformed or relies on an unsupported
// feature ('previous_response_id', 'file_url', non-function tools, ...).
json convert_responses_to_chatcmpl(const json & response_body) {
    if (!response_body.contains("input")) {
        throw std::invalid_argument("'input' is required");
    }
    if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
        throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
    }

    // response_body outlives this function, so a reference is enough (no deep copy needed)
    const json & input_value = response_body.at("input");
    json chatcmpl_body = response_body;
    chatcmpl_body.erase("input");
    std::vector<json> chatcmpl_messages;

    if (response_body.contains("instructions")) {
        // An explicit null means "no instructions": do not emit an empty system message for it
        if (!response_body.at("instructions").is_null()) {
            chatcmpl_messages.push_back({
                {"role",    "system"},
                {"content", json_value(response_body, "instructions", std::string())},
            });
        }
        chatcmpl_body.erase("instructions");
    }

    if (input_value.is_string()) {
        // #responses_create-input-text_input
        chatcmpl_messages.push_back({
            {"role",    "user"},
            {"content", input_value},
        });
    } else if (input_value.is_array()) {
        // #responses_create-input-input_item_list

        // json::contains() returns false for non-objects, so these helpers are safe on any json value
        const auto exists_and_is_array = [](const json & j, const char * key) -> bool {
            return j.contains(key) && j.at(key).is_array();
        };
        const auto exists_and_is_string = [](const json & j, const char * key) -> bool {
            return j.contains(key) && j.at(key).is_string();
        };

        // each item is copied on purpose: it is rewritten in place before being pushed
        for (json item : input_value) {
            if (exists_and_is_string(item, "content")) {
                // #responses_create-input-input_item_list-input_message-content-text_input
                // Only "Input message" contains item["content"]::string
                // After converting item["content"]::string to item["content"]::array,
                // we can treat "Input message" as sum of "Item-Input message" and "Item-Output message".
                // "Input message" allows role "assistant" as well; its text is tagged "output_text"
                // so that it is accepted by the output message branch below.
                const bool is_assistant = exists_and_is_string(item, "role") && item.at("role") == "assistant";
                const json text = item.at("content");
                item["content"] = json::array({
                    json {
                        {"text", text},
                        {"type", is_assistant ? "output_text" : "input_text"}
                    }
                });
            }

            if (exists_and_is_array(item, "content") &&
                exists_and_is_string(item, "role") &&
                (item.at("role") == "user" ||
                 item.at("role") == "system" ||
                 item.at("role") == "developer")
            ) {
                // #responses_create-input-input_item_list-item-input_message
                std::vector<json> chatcmpl_content;

                for (const json & input_item : item.at("content")) {
                    const std::string type = json_value(input_item, "type", std::string());

                    if (type == "input_text") {
                        if (!input_item.contains("text")) {
                            throw std::invalid_argument("'Input text' requires 'text'");
                        }
                        chatcmpl_content.push_back({
                            {"text", input_item.at("text")},
                            {"type", "text"},
                        });
                    } else if (type == "input_image") {
                        // While `detail` is marked as required,
                        // it has default value("auto") and can be omitted.

                        if (!input_item.contains("image_url")) {
                            throw std::invalid_argument("'image_url' is required");
                        }
                        chatcmpl_content.push_back({
                            {"image_url", json {
                                {"url", input_item.at("image_url")}
                            }},
                            {"type", "image_url"},
                        });
                    } else if (type == "input_file") {
                        if (input_item.contains("file_url")) {
                            // chat completion API does not support file_url
                            throw std::invalid_argument("'file_url' is not supported");
                        }
                        if (!input_item.contains("file_data") || !input_item.contains("filename")) {
                            throw std::invalid_argument("Both 'file_data' and 'filename' are required");
                        }
                        chatcmpl_content.push_back({
                            {"file", json {
                                {"file_data", input_item.at("file_data")},
                                {"filename",  input_item.at("filename")},
                            }},
                            {"type", "file"},
                        });
                    } else {
                        throw std::invalid_argument("'type' must be one of 'input_text', 'input_image', or 'input_file'");
                    }
                }

                // erase-by-key is a no-op when the key is absent
                item.erase("type");
                item.erase("status");
                item["content"] = chatcmpl_content;

                chatcmpl_messages.push_back(item);
            } else if (exists_and_is_array(item, "content") &&
                exists_and_is_string(item, "role") &&
                item.at("role") == "assistant" &&
                // item["status"] is not validated: it is not sent by codex-cli
                // item["type"] is optional in the "Input message" form, but must be "message" when present
                (!item.contains("type") || item.at("type") == "message")
            ) {
                // #responses_create-input-input_item_list-item-output_message
                std::vector<json> chatcmpl_content;

                for (const auto & output_text : item.at("content")) {
                    const std::string type = json_value(output_text, "type", std::string());
                    if (type != "output_text") {
                        throw std::invalid_argument("'type' must be 'output_text'");
                    }
                    if (!exists_and_is_string(output_text, "text")) {
                        throw std::invalid_argument("'Output text' requires 'text'");
                    }
                    // Ignore annotations and logprobs for now
                    chatcmpl_content.push_back({
                        {"text", output_text.at("text")},
                        {"type", "text"},
                    });
                }

                item.erase("status");
                item.erase("type");
                item["content"] = chatcmpl_content;
                chatcmpl_messages.push_back(item);
            } else if (exists_and_is_string(item, "arguments") &&
                exists_and_is_string(item, "call_id") &&
                exists_and_is_string(item, "name") &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "function_call"
            ) {
                // #responses_create-input-input_item_list-item-function_tool_call
                json msg = json {
                    {"role", "assistant"},
                    {"tool_calls", json::array({ json {
                        {"function", json {
                            {"arguments", item.at("arguments")},
                            {"name",      item.at("name")},
                        }},
                        {"id",   item.at("call_id")},
                        {"type", "function"},
                    }})},
                };

                if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) {
                    // Move reasoning content from dummy message to tool call message
                    msg["reasoning_content"] = chatcmpl_messages.back().at("reasoning_content");
                    chatcmpl_messages.pop_back();
                }
                chatcmpl_messages.push_back(msg);
            } else if (exists_and_is_string(item, "call_id") &&
                (exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "function_call_output"
            ) {
                // #responses_create-input-input_item_list-item-function_tool_call_output
                if (item.at("output").is_string()) {
                    chatcmpl_messages.push_back(json {
                        {"content",      item.at("output")},
                        {"role",         "tool"},
                        {"tool_call_id", item.at("call_id")},
                    });
                } else {
                    // array output: only 'input_text' parts are accepted, retagged as 'text'
                    json chatcmpl_outputs = item.at("output");
                    for (json & chatcmpl_output : chatcmpl_outputs) {
                        if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") {
                            throw std::invalid_argument("Output of tool call should be 'Input text'");
                        }
                        chatcmpl_output["type"] = "text";
                    }
                    chatcmpl_messages.push_back(json {
                        {"content",      chatcmpl_outputs},
                        {"role",         "tool"},
                        {"tool_call_id", item.at("call_id")},
                    });
                }
            } else if (// item["id"] is not validated: it is not sent by codex-cli
                exists_and_is_array(item, "summary") &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "reasoning") {
                // #responses_create-input-input_item_list-item-reasoning

                if (!exists_and_is_array(item, "content")) {
                    throw std::invalid_argument("item['content'] is not an array");
                }
                if (item.at("content").empty()) {
                    throw std::invalid_argument("item['content'] is empty");
                }
                if (!exists_and_is_string(item.at("content")[0], "text")) {
                    throw std::invalid_argument("item['content']['text'] is not a string");
                }

                // Pack reasoning content in dummy message; a following function call picks it up
                chatcmpl_messages.push_back(json {
                    {"role",              "assistant"},
                    {"content",           json::array()},
                    {"reasoning_content", item.at("content")[0].at("text")},
                });
            } else {
                throw std::invalid_argument("Cannot determine type of 'item'");
            }
        }
    } else {
        throw std::invalid_argument("'input' must be a string or array of objects");
    }

    // Remove unused dummy message which contains
    // reasoning content not followed by tool call
    chatcmpl_messages.erase(std::remove_if(
        chatcmpl_messages.begin(),
        chatcmpl_messages.end(),
        [](const json & x) {
            return x.contains("role") &&
                   x.at("role") == "assistant" &&
                   x.contains("content") &&
                   x.at("content") == json::array() &&
                   x.contains("reasoning_content");
        }),
        chatcmpl_messages.end()
    );

    chatcmpl_body["messages"] = chatcmpl_messages;

    if (response_body.contains("tools")) {
        if (!response_body.at("tools").is_array()) {
            throw std::invalid_argument("'tools' must be an array of objects");
        }
        std::vector<json> chatcmpl_tools;
        // each tool is copied on purpose: it is edited before being nested under "function"
        for (json resp_tool : response_body.at("tools")) {
            if (json_value(resp_tool, "type", std::string()) != "function") {
                throw std::invalid_argument("'type' of tool must be 'function'");
            }
            resp_tool.erase("type");

            // 'strict' defaults to true when the client does not specify it
            if (!resp_tool.contains("strict")) {
                resp_tool["strict"] = true;
            }

            json chatcmpl_tool;
            chatcmpl_tool["type"]     = "function";
            chatcmpl_tool["function"] = resp_tool;
            chatcmpl_tools.push_back(chatcmpl_tool);
        }
        // assignment replaces the Responses-style 'tools' copied from the request
        chatcmpl_body["tools"] = chatcmpl_tools;
    }

    if (response_body.contains("max_output_tokens")) {
        chatcmpl_body.erase("max_output_tokens");
        chatcmpl_body["max_tokens"] = response_body.at("max_output_tokens");
    }

    return chatcmpl_body;
}

json convert_anthropic_to_oai(const json & body) {
json oai_body;

Expand Down Expand Up @@ -1482,6 +1758,24 @@ std::string format_oai_sse(const json & data) {
return ss.str();
}

// Serialize Responses-API stream events as Server-Sent Events.
// Every event object must provide "event" (string, the SSE event name) and "data" (the payload).
// When `data` is a json array, one SSE event is written per element, in order.
std::string format_oai_resp_sse(const json & data) {
    std::ostringstream out;

    const auto append_event = [&out](const json & ev) {
        const std::string event_name = ev.at("event").get<std::string>();
        out << "event: " << event_name << "\n";
        out << "data: " << safe_json_to_str(ev.at("data")) << "\n\n";
    };

    if (!data.is_array()) {
        append_event(data);
        return out.str();
    }

    for (const auto & ev : data) {
        append_event(ev);
    }
    return out.str();
}

std::string format_anthropic_sse(const json & data) {
std::ostringstream ss;

Expand Down
5 changes: 5 additions & 0 deletions tools/server/server-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,9 @@ json oaicompat_chat_params_parse(
const server_chat_params & opt,
std::vector<raw_buffer> & out_files);

// convert OpenAI Responses API format to OpenAI Chat Completions API format
// throws std::invalid_argument if the request is malformed or uses an unsupported feature (e.g. 'previous_response_id')
json convert_responses_to_chatcmpl(const json & body);

// convert Anthropic Messages API format to OpenAI Chat Completions API format
json convert_anthropic_to_oai(const json & body);

Expand Down Expand Up @@ -331,6 +334,8 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l
// note: if data is a json array, it will be sent as multiple events, one per item
std::string format_oai_sse(const json & data);

// format OpenAI Responses-style SSE with event types: each object provides "event" (string) and "data"
// note: if data is a json array, it will be sent as multiple events, one per item
std::string format_oai_resp_sse(const json & data);

// format Anthropic-style SSE with event types
std::string format_anthropic_sse(const json & data);

Expand Down
Loading