From 02394884e5b0632f11c885738614dad4a166b6aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Mu=C5=A1tar?= Date: Wed, 7 Jan 2026 16:40:52 +0100 Subject: [PATCH] fix(agent): use llama.cpp standard chat parser for tool calls Replace the hardcoded XML parser in the agent with llama.cpp's standard chat parser (common_chat_parse), which supports 27+ chat template formats including Hermes 2 Pro, Qwen3-Coder, Llama 3.x, DeepSeek, and others. The agent previously only supported Qwen3-Coder XML format (<function=name><parameter=name>value</parameter></function>). Now it leverages the server-parsed message which handles all formats automatically based on the model's chat template. Key changes: - Remove ~165 lines of custom XML parsing code from agent-loop.cpp - Use res_final->oaicompat_msg from server response instead - Sync chat syntax from task params to result state during streaming - Fix timing issue where state was created before format was determined This enables tool calling with models like NousCoder-14B that use Hermes 2 Pro format (<tool_call>{"name":...}</tool_call>). --- tools/agent/agent-loop.cpp | 181 +++----------------------------- tools/server/server-context.cpp | 14 +-- tools/server/server-task.h | 10 ++ 3 files changed, 30 insertions(+), 175 deletions(-) diff --git a/tools/agent/agent-loop.cpp b/tools/agent/agent-loop.cpp index c22a7170c6c..465aa35a29d 100644 --- a/tools/agent/agent-loop.cpp +++ b/tools/agent/agent-loop.cpp @@ -228,172 +228,6 @@ void agent_loop::clear() { permission_mgr_.clear_session(); } -// Parse a single function block: ...value... 
-static bool parse_function_block(const std::string & block, common_chat_tool_call & tc) { - // Parse function name: - size_t func_start = block.find("", func_name_start); - if (func_name_end == std::string::npos) return false; - - tc.name = block.substr(func_name_start, func_name_end - func_name_start); - - // Find function block end - size_t func_end = block.find("", func_name_end); - if (func_end == std::string::npos) func_end = block.size(); - - std::string func_body = block.substr(func_name_end + 1, func_end - func_name_end - 1); - - // Parse parameters - json args = json::object(); - size_t param_pos = 0; - while ((param_pos = func_body.find("", param_name_start); - if (param_name_end == std::string::npos) break; - - std::string param_name = func_body.substr(param_name_start, param_name_end - param_name_start); - - // Find parameter value (between > and or next ", value_start); - size_t next_param = func_body.find("... -// ... (without wrapper) -static common_chat_msg parse_tool_calls_xml(const std::string & content) { - common_chat_msg msg; - msg.role = "assistant"; - - std::string remaining = content; - - // First, try to find wrapped format - size_t tool_call_start = remaining.find(""); - // If no , look for bare ......... format - while ((tool_call_start = remaining.find("")) != std::string::npos) { - size_t tool_call_end = remaining.find("", tool_call_start); - if (tool_call_end == std::string::npos) break; - - std::string tool_block = remaining.substr(tool_call_start + 11, tool_call_end - tool_call_start - 11); - remaining = remaining.substr(tool_call_end + 12); - - common_chat_tool_call tc; - tc.id = "call_" + std::to_string(msg.tool_calls.size()); - if (parse_function_block(tool_block, tc)) { - msg.tool_calls.push_back(tc); - } - } - } else { - // Parse bare ... 
format - while ((func_start = remaining.find("", func_start); - if (func_end == std::string::npos) break; - - std::string func_block = remaining.substr(func_start, func_end - func_start + 11); - remaining = remaining.substr(func_end + 11); - - common_chat_tool_call tc; - tc.id = "call_" + std::to_string(msg.tool_calls.size()); - if (parse_function_block(func_block, tc)) { - msg.tool_calls.push_back(tc); - } - } - } - - return msg; -} - common_chat_msg agent_loop::generate_completion(result_timings & out_timings) { server_response_reader rd = server_ctx_.get_response_reader(); { @@ -479,7 +313,12 @@ common_chat_msg agent_loop::generate_completion(result_timings & out_timings) { auto res_final = dynamic_cast(result.get()); if (res_final) { out_timings = std::move(res_final->timings); - // Use the raw content for our own parsing + // Use the server-parsed message which handles all chat template formats + // (Hermes 2 Pro, Qwen3-Coder, Llama 3.x, DeepSeek, etc.) + if (!res_final->oaicompat_msg.empty()) { + return res_final->oaicompat_msg; + } + // Fallback to raw content if no parsed message if (!res_final->content.empty()) { full_content = res_final->content; } @@ -501,8 +340,12 @@ common_chat_msg agent_loop::generate_completion(result_timings & out_timings) { return msg; } - // Parse tool calls ourselves using the qwen3-coder/nemotron XML format - return parse_tool_calls_xml(full_content); + // Fallback: return content without tool calls + // (Server should have parsed if parse_tool_calls=true, but handle edge cases) + common_chat_msg msg; + msg.role = "assistant"; + msg.content = full_content; + return msg; } tool_result agent_loop::execute_tool_call(const common_chat_tool_call & call) { diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 02417fca1f9..3a646f31cd5 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1428,6 +1428,7 @@ struct server_context_impl { res->res_type = 
slot.task->params.res_type; res->oaicompat_model = slot.task->params.oaicompat_model; res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; + res->oaicompat_chat_syntax = slot.task->params.oaicompat_chat_syntax; // populate res.probs_output if (slot.task->params.sampling.n_probs > 0) { @@ -1470,12 +1471,13 @@ struct server_context_impl { res->stop = slot.stop; res->post_sampling_probs = slot.task->params.post_sampling_probs; - res->verbose = slot.task->params.verbose; - res->stream = slot.task->params.stream; - res->include_usage = slot.task->params.include_usage; - res->res_type = slot.task->params.res_type; - res->oaicompat_model = slot.task->params.oaicompat_model; - res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; + res->verbose = slot.task->params.verbose; + res->stream = slot.task->params.stream; + res->include_usage = slot.task->params.include_usage; + res->res_type = slot.task->params.res_type; + res->oaicompat_model = slot.task->params.oaicompat_model; + res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; + res->oaicompat_chat_syntax = slot.task->params.oaicompat_chat_syntax; // populate res.probs_output if (slot.task->params.sampling.n_probs > 0) { diff --git a/tools/server/server-task.h b/tools/server/server-task.h index ead14911821..d225ddb6673 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -293,6 +293,9 @@ struct server_task_result_cmpl_final : server_task_result { std::string oaicompat_cmpl_id; common_chat_msg oaicompat_msg; // to be populated by update() + // Chat syntax for tool call parsing (synced from task params) + common_chat_syntax oaicompat_chat_syntax; + std::vector oaicompat_msg_diffs; // to be populated by update() bool is_updated = false; @@ -304,6 +307,8 @@ struct server_task_result_cmpl_final : server_task_result { virtual void update(task_result_state & state) override { is_updated = true; + // Sync chat syntax from server (may have been updated by tokenize_cli_input) + 
state.oaicompat_chat_syntax = oaicompat_chat_syntax; oaicompat_msg = state.update_chat_msg(content, false, oaicompat_msg_diffs); } @@ -341,6 +346,9 @@ struct server_task_result_cmpl_partial : server_task_result { std::vector oaicompat_msg_diffs; // to be populated by update() bool is_updated = false; + // Chat syntax for tool call parsing (synced from task params) + common_chat_syntax oaicompat_chat_syntax; + // for Anthropic API: track if any reasoning content has been generated bool anthropic_has_reasoning = false; // Streaming state copied from task_result_state for this chunk @@ -355,6 +363,8 @@ struct server_task_result_cmpl_partial : server_task_result { virtual void update(task_result_state & state) override { is_updated = true; + // Sync chat syntax from server (may have been updated by tokenize_cli_input) + state.oaicompat_chat_syntax = oaicompat_chat_syntax; state.update_chat_msg(content, true, oaicompat_msg_diffs); // track if the accumulated message has any reasoning content anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();