Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.

**Features:**
* LLM inference of F16 and quantized models on GPU and CPU
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions, responses, and embeddings routes
* Parallel decoding with multi-user support
* Continuous batching
* Multimodal (wip)
Expand Down Expand Up @@ -706,6 +706,48 @@ curl http://localhost:8080/v1/chat/completions \

**See our [Function calling](../../docs/function-calling.md) docs** for more details, supported native tool call styles (generic tool call style is used as fallback) / examples of use.

### POST `/v1/responses`: OpenAI-compatible Responses API

*Options:*

See [OpenAI Responses API documentation](https://platform.openai.com/docs/api-reference/responses).

*Examples:*

You can use either the Python `openai` library with appropriate checkpoints:

```python
import openai

client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key = "sk-no-key-required"
)

response = client.responses.create(
model="gpt-4.1",
instructions="You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
input="Write a limerick about python exceptions"
)

print(response.output_text)
```

... or raw HTTP requests:

```shell
curl http://localhost:8080/v1/responses \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"model": "gpt-4.1",
"instructions": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
"input": "Write a limerick about python exceptions"
}'
```

This endpoint works by converting Responses requests into Chat Completions requests.

### POST `/v1/embeddings`: OpenAI-compatible embeddings API

*Options:*
Expand Down
270 changes: 270 additions & 0 deletions examples/server/server-common.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "server-common.h"

#include <algorithm>

using raw_buffer = std::vector<uint8_t>;


Expand Down Expand Up @@ -505,6 +507,30 @@ bool server_sent_event(httplib::DataSink& sink, const json& data) {
return true;
}

// Stream OpenAI Responses API server-sent events over the given sink.
// A JSON array is treated as a batch of events, each written as its own
// SSE frame in order; a single object is written as one frame.
// Returns false as soon as a write fails.
bool server_sent_oai_resp_event(httplib::DataSink& sink, const json& data) {
    static auto emit = [](httplib::DataSink& s, const json& ev) -> bool {
        const std::string frame =
            "event: " + ev.at("event").get<std::string>() + "\n" +
            "data: " + ev.at("data").dump(-1, ' ', false, json::error_handler_t::replace) + "\n\n";

        LOG_DBG("data stream, to_send: %s", frame.c_str());
        return s.write(frame.c_str(), frame.size());
    };

    if (!data.is_array()) {
        return emit(sink, data);
    }

    for (const auto& ev : data) {
        if (!emit(sink, ev)) {
            return false;
        }
    }
    return true;
}

bool server_sent_anthropic_event(httplib::DataSink& sink, const json& data) {
static auto send_single = [](httplib::DataSink& sink, const json& data) -> bool {
const std::string str =
Expand Down Expand Up @@ -874,6 +900,250 @@ json oaicompat_chat_params_parse(
return llama_params;
}

// Convert an OpenAI Responses API request body into an equivalent OpenAI
// Chat Completions request body.
//
// The Responses "input" (a plain string or an array of typed items) is
// flattened into a Chat Completions "messages" array; "instructions"
// becomes a leading system message; "tools" and "max_output_tokens" are
// reshaped/renamed to their Chat Completions counterparts. All other
// top-level fields are passed through unchanged.
//
// Throws std::runtime_error when required fields are missing or an
// unsupported feature is requested.
json convert_responses_to_chatcmpl(const json& response_body) {
    if (!response_body.contains("input")) {
        throw std::runtime_error("'input' is required");
    }
    // Server-side conversation state is not implemented, so the client must
    // resend the full history in "input" on every request.
    if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
        throw std::runtime_error("ik_llama.cpp does not support 'previous_response_id'.");
    }

    const json input_value = response_body.at("input");
    // Start from a copy of the request so unrelated fields (model, stream,
    // sampling params, ...) carry over, then strip the Responses-only keys.
    json chatcmpl_body = response_body;
    chatcmpl_body.erase("input");
    std::vector<json> chatcmpl_messages;

    // "instructions" maps to a system message placed before all input items.
    if (response_body.contains("instructions")) {
        chatcmpl_messages.push_back({
            {"role", "system"},
            {"content", json_value(response_body, "instructions", std::string())},
        });
        chatcmpl_body.erase("instructions");
    }

    if (input_value.is_string()) {
        // Shorthand form: a bare string is a single user message.
        chatcmpl_messages.push_back({
            {"role", "user"},
            {"content", input_value},
        });
    }
    else if (input_value.is_array()) {
        static auto exists_and_is_array = [](const json& j, const char* key) -> bool {
            return j.contains(key) && j.at(key).is_array();
        };
        static auto exists_and_is_string = [](const json& j, const char* key) -> bool {
            return j.contains(key) && j.at(key).is_string();
        };

        // Iterate by value: each item is normalized and mutated in place
        // before being appended to chatcmpl_messages.
        for (json item : input_value) {
            // Normalize a string "content" into the array-of-parts form so
            // the branches below only need to handle one shape.
            if (exists_and_is_string(item, "content")) {
                item["content"] = json::array({
                    json{
                        {"text", item.at("content")},
                        {"type", "input_text"},
                    }
                });
            }

            // Case 1: user/system/developer input message — translate each
            // content part from Responses types to Chat Completions types.
            if (exists_and_is_array(item, "content") &&
                exists_and_is_string(item, "role") &&
                (item.at("role") == "user" || item.at("role") == "system" || item.at("role") == "developer")
            ) {
                std::vector<json> chatcmpl_content;

                for (const json& input_item : item.at("content")) {
                    const std::string type = json_value(input_item, "type", std::string());

                    if (type == "input_text") {
                        if (!input_item.contains("text")) {
                            throw std::runtime_error("'Input text' requires 'text'");
                        }
                        // "input_text" -> Chat Completions "text" part.
                        chatcmpl_content.push_back({
                            {"text", input_item.at("text")},
                            {"type", "text"},
                        });
                    }
                    else if (type == "input_image") {
                        if (!input_item.contains("image_url")) {
                            throw std::runtime_error("'image_url' is required");
                        }
                        // "input_image" -> Chat Completions "image_url" part.
                        chatcmpl_content.push_back({
                            {"image_url", json{
                                {"url", input_item.at("image_url")},
                            }},
                            {"type", "image_url"},
                        });
                    }
                    else if (type == "input_file") {
                        throw std::runtime_error("'input_file' is not supported by ik_llama.cpp at this moment");
                    }
                    else {
                        throw std::runtime_error("'type' must be one of 'input_text', 'input_image', or 'input_file'");
                    }
                }

                // Drop Responses-only bookkeeping fields, keep the rest of
                // the item (e.g. "role") intact.
                if (item.contains("type")) {
                    item.erase("type");
                }
                if (item.contains("status")) {
                    item.erase("status");
                }
                item["content"] = chatcmpl_content;

                chatcmpl_messages.push_back(item);
            }
            // Case 2: a previously generated assistant message being
            // replayed as history; content parts must all be "output_text".
            else if (exists_and_is_array(item, "content") &&
                exists_and_is_string(item, "role") &&
                item.at("role") == "assistant" &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "message"
            ) {
                std::vector<json> chatcmpl_content;

                for (const auto& output_text : item.at("content")) {
                    const std::string type = json_value(output_text, "type", std::string());
                    if (type != "output_text") {
                        throw std::runtime_error("'type' must be 'output_text'");
                    }
                    if (!exists_and_is_string(output_text, "text")) {
                        throw std::runtime_error("'Output text' requires 'text'");
                    }
                    chatcmpl_content.push_back({
                        {"text", output_text.at("text")},
                        {"type", "text"},
                    });
                }

                item.erase("status");
                item.erase("type");
                item["content"] = chatcmpl_content;
                chatcmpl_messages.push_back(item);
            }
            // Case 3: a tool call previously emitted by the assistant,
            // rebuilt as a Chat Completions "tool_calls" message.
            else if (exists_and_is_string(item, "arguments") &&
                exists_and_is_string(item, "call_id") &&
                exists_and_is_string(item, "name") &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "function_call"
            ) {
                json msg = json{
                    {"role", "assistant"},
                    {"tool_calls", json::array({json{
                        {"function", json{
                            {"arguments", item.at("arguments")},
                            {"name", item.at("name")},
                        }},
                        {"id", item.at("call_id")},
                        {"type", "function"},
                    }})},
                };

                // If the previous message is a reasoning placeholder
                // (Case 5), fold its reasoning_content into this tool-call
                // message, since Chat Completions carries reasoning on the
                // same assistant message as the call.
                if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) {
                    msg["reasoning_content"] = chatcmpl_messages.back().at("reasoning_content");
                    chatcmpl_messages.pop_back();
                }
                chatcmpl_messages.push_back(msg);
            }
            // Case 4: the output of a tool call, fed back as a "tool"
            // message keyed by tool_call_id.
            else if (exists_and_is_string(item, "call_id") &&
                (exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "function_call_output"
            ) {
                if (item.at("output").is_string()) {
                    chatcmpl_messages.push_back(json{
                        {"content", item.at("output")},
                        {"role", "tool"},
                        {"tool_call_id", item.at("call_id")},
                    });
                }
                else {
                    // Structured output: every part must be "input_text";
                    // retag each as a Chat Completions "text" part.
                    json chatcmpl_outputs = item.at("output");
                    for (json& chatcmpl_output : chatcmpl_outputs) {
                        if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") {
                            throw std::runtime_error("Output of tool call should be 'Input text'");
                        }
                        chatcmpl_output["type"] = "text";
                    }
                    chatcmpl_messages.push_back(json{
                        {"content", chatcmpl_outputs},
                        {"role", "tool"},
                        {"tool_call_id", item.at("call_id")},
                    });
                }
            }
            // Case 5: a reasoning item. Stored as a placeholder assistant
            // message (empty content + reasoning_content) so a following
            // function_call can absorb it; leftovers are removed below.
            else if (exists_and_is_array(item, "summary") &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "reasoning") {
                if (!exists_and_is_array(item, "content")) {
                    throw std::runtime_error("item['content'] is not an array");
                }
                if (item.at("content").empty()) {
                    throw std::runtime_error("item['content'] is empty");
                }
                // Only the first content entry's text is used.
                if (!exists_and_is_string(item.at("content")[0], "text")) {
                    throw std::runtime_error("item['content']['text'] is not a string");
                }

                chatcmpl_messages.push_back(json{
                    {"role", "assistant"},
                    {"content", json::array()},
                    {"reasoning_content", item.at("content")[0].at("text")},
                });
            }
            else {
                throw std::runtime_error("Cannot determine type of 'item'");
            }
        }
    }
    else {
        throw std::runtime_error("'input' must be a string or array of objects");
    }

    // Drop reasoning placeholders that were never absorbed by a tool call:
    // assistant messages with an empty content array and reasoning_content.
    chatcmpl_messages.erase(std::remove_if(
        chatcmpl_messages.begin(),
        chatcmpl_messages.end(),
        [](const json& x) {
            return x.contains("role") &&
                x.at("role") == "assistant" &&
                x.contains("content") &&
                x.at("content") == json::array() &&
                x.contains("reasoning_content");
        }),
        chatcmpl_messages.end());

    chatcmpl_body["messages"] = chatcmpl_messages;

    // Responses tool definitions are flat; Chat Completions nests them under
    // a "function" key. "strict" defaults to true when absent.
    if (response_body.contains("tools")) {
        if (!response_body.at("tools").is_array()) {
            throw std::runtime_error("'tools' must be an array of objects");
        }
        std::vector<json> chatcmpl_tools;
        for (json resp_tool : response_body.at("tools")) {
            json chatcmpl_tool;

            if (json_value(resp_tool, "type", std::string()) != "function") {
                throw std::runtime_error("'type' of tool must be 'function'");
            }
            resp_tool.erase("type");
            chatcmpl_tool["type"] = "function";

            if (!resp_tool.contains("strict")) {
                resp_tool["strict"] = true;
            }
            chatcmpl_tool["function"] = resp_tool;
            chatcmpl_tools.push_back(chatcmpl_tool);
        }
        chatcmpl_body.erase("tools");
        chatcmpl_body["tools"] = chatcmpl_tools;
    }

    // Rename max_output_tokens -> max_tokens.
    if (response_body.contains("max_output_tokens")) {
        chatcmpl_body.erase("max_output_tokens");
        chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
    }

    return chatcmpl_body;
}

json anthropic_params_from_json(
const struct llama_model* model,
const json& body_in, /* anthropic messages api json semantics */
Expand Down
6 changes: 5 additions & 1 deletion examples/server/server-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,8 @@ json probs_vector_to_json(const llama_context* ctx, const std::vector<completion

bool server_sent_event(httplib::DataSink& sink, const json& data);

bool server_sent_oai_resp_event(httplib::DataSink& sink, const json& data);

bool server_sent_anthropic_event(httplib::DataSink& sink, const json& data);

//
Expand All @@ -246,6 +248,9 @@ json oaicompat_chat_params_parse(
const oaicompat_parser_options& opt,
std::vector<raw_buffer>& out_files);

// convert OpenAI Responses API format to OpenAI Chat Completions API format
json convert_responses_to_chatcmpl(const json& body);

json anthropic_params_from_json(
const struct llama_model* model,
const json& body_in, /* anthropic messages api json semantics */
Expand Down Expand Up @@ -461,4 +466,3 @@ bool prompt_cache_equal(llama_context* ctx, const server_tokens& cache_tokens,
const server_tokens& prompt_tokens, size_t start, const common_prefix& prefix);

std::string safe_json_to_str(const json& data);

Loading