From 92d0d81a8053a7ba3968a46d27c2e4a66aef7853 Mon Sep 17 00:00:00 2001 From: Trevon Date: Sat, 9 Aug 2025 12:45:07 -0400 Subject: [PATCH 1/4] chat-parser : fix GPT-OSS Harmony format end token handling The GPT-OSS model uses OpenAI's Harmony format which includes an <|end|> token after the final message. The parser wasn't handling this token properly, causing finish() to throw 'Unexpected content at end of input'. Modified the GPT-OSS parser to strip the <|end|> token from the content if present. Added comprehensive tests for GPT-OSS format with and without the end token to ensure backward compatibility. --- common/chat.cpp | 10 ++++++- tests/test-chat-parser.cpp | 58 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/common/chat.cpp b/common/chat.cpp index 316bd24170c9e..89e737babaabf 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1322,7 +1322,15 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) { // TODO @ngxson : this won't work with --special enabled, we should fix that builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>"); if (!builder.syntax().parse_tool_calls) { - builder.add_content(builder.consume_rest()); + // First consume everything except potential <|end|> token + auto rest = builder.consume_rest(); + // Check if the rest ends with <|end|> and remove it if present + const std::string end_token = "<|end|>"; + if (rest.size() >= end_token.size() && + rest.substr(rest.size() - end_token.size()) == end_token) { + rest = rest.substr(0, rest.size() - end_token.size()); + } + builder.add_content(rest); return; } } diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp index 59e44e07d25ed..a83fb9339bbfa 100644 --- a/tests/test-chat-parser.cpp +++ b/tests/test-chat-parser.cpp @@ -10,6 +10,7 @@ #include #include "chat-parser.h" +#include "chat.h" #include "common.h" #include "log.h" #include "regex-partial.h" 
@@ -341,12 +342,69 @@ static void test_positions() { } } +static void test_gpt_oss_harmony_format() { + // Test GPT-OSS Harmony format with <|end|> token + { + auto msg = common_chat_parse( + "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, // Use AUTO to enable reasoning parsing + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + } + ); + // Should not throw with <|end|> token + assert_equals("The answer is 42", msg.content); + assert_equals("Let me think about this...", msg.reasoning_content); + } + + // Test without <|end|> token (backward compatibility) + { + auto msg = common_chat_parse( + "<|channel|>analysis<|message|>Thinking...<|start|>assistant<|channel|>final<|message|>Hello world", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + } + ); + // Should not throw without <|end|> token + assert_equals("Hello world", msg.content); + assert_equals("Thinking...", msg.reasoning_content); + } + + // Test partial message with <|end|> token + { + auto msg = common_chat_parse( + "<|channel|>analysis<|message|>Processing...<|start|>assistant<|channel|>final<|message|>Partial result<|e", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + } + ); + // Should not throw for partial message + assert_equals("Partial result<|e", msg.content); + assert_equals("Processing...", msg.reasoning_content); + } +} + int 
main() { test_positions(); test_json_with_dumped_args_no_args(); test_json_with_dumped_args(); test_reasoning(); test_regex(); + test_gpt_oss_harmony_format(); std::cout << "All tests passed!\n"; return 0; } From 730c0d9167ef3f577a51aa538a408c988e601302 Mon Sep 17 00:00:00 2001 From: Trevon Date: Sat, 9 Aug 2025 13:49:26 -0400 Subject: [PATCH 2/4] Fix server crash when using GPT-OSS models with tools that was caused by unconsumed <|end|> tokens in the chat parser. --- common/chat.cpp | 32 ++++++++++++++------- common/chat.h | 1 + tests/test-chat-parser.cpp | 58 ++++++++++++++++++++++++++++++++++++-- tools/server/server.cpp | 13 +++++++-- tools/server/utils.hpp | 1 + 5 files changed, 91 insertions(+), 14 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 89e737babaabf..7996c14df6513 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1313,6 +1313,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp data.prompt = prompt; data.format = COMMON_CHAT_FORMAT_GPT_OSS; + data.reasoning_format = COMMON_REASONING_FORMAT_AUTO; // GPT-OSS Harmony format always has reasoning channels // TODO: support tool calls in GPT-OSS? 
@@ -1321,17 +1322,28 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) { // TODO @ngxson : this won't work with --special enabled, we should fix that builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>"); - if (!builder.syntax().parse_tool_calls) { - // First consume everything except potential <|end|> token - auto rest = builder.consume_rest(); - // Check if the rest ends with <|end|> and remove it if present - const std::string end_token = "<|end|>"; - if (rest.size() >= end_token.size() && - rest.substr(rest.size() - end_token.size()) == end_token) { - rest = rest.substr(0, rest.size() - end_token.size()); + + // Check if we have an <|end|> token at the current position or later + auto end_pos = builder.input().find("<|end|>", builder.pos()); + + if (end_pos != std::string::npos) { + // Content is everything from current position to <|end|> + auto content = builder.input().substr(builder.pos(), end_pos - builder.pos()); + if (!builder.syntax().parse_tool_calls) { + builder.add_content(content); + } + // Move to the <|end|> token and consume it + builder.move_to(end_pos); + builder.consume_literal("<|end|>"); + } else { + // No <|end|> token, consume everything remaining + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + } else { + // Even when parse_tool_calls=true, we need to consume the remaining content + // to avoid the "Unexpected content at end of input" error + builder.consume_rest(); } - builder.add_content(rest); - return; } } diff --git a/common/chat.h b/common/chat.h index eb628d8bc275d..ba28f5ba323f9 100644 --- a/common/chat.h +++ b/common/chat.h @@ -135,6 +135,7 @@ struct common_chat_templates_inputs { struct common_chat_params { common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + common_reasoning_format reasoning_format = 
COMMON_REASONING_FORMAT_NONE; // Template-specific reasoning format std::string prompt; std::string grammar; bool grammar_lazy = false; diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp index a83fb9339bbfa..76090b538baf8 100644 --- a/tests/test-chat-parser.cpp +++ b/tests/test-chat-parser.cpp @@ -343,14 +343,14 @@ static void test_positions() { } static void test_gpt_oss_harmony_format() { - // Test GPT-OSS Harmony format with <|end|> token + // Test GPT-OSS Harmony format with <|end|> token and AUTO reasoning (parse_tool_calls=false) { auto msg = common_chat_parse( "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>", /* is_partial= */ false, { /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, // Use AUTO to enable reasoning parsing + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, /* .reasoning_in_content = */ false, /* .thinking_forced_open = */ false, /* .parse_tool_calls = */ false, @@ -361,6 +361,60 @@ static void test_gpt_oss_harmony_format() { assert_equals("Let me think about this...", msg.reasoning_content); } + // Test with parse_tool_calls=true (the default case that was failing) + { + auto msg = common_chat_parse( + "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ true, // This was the failing case! 
+ } + ); + // Should not throw with <|end|> token even when parse_tool_calls is true + assert_equals("", msg.content); // Content is empty when parse_tool_calls is true - content is consumed but not added + assert_equals("Let me think about this...", msg.reasoning_content); + } + + // Test with parse_tool_calls=true and NO <|end|> token (real server scenario) + { + auto msg = common_chat_parse( + "<|channel|>analysis<|message|>Need location info<|start|>assistant<|channel|>final<|message|>Could you specify the location?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ true, // Server default with tools + } + ); + // Should not throw even without <|end|> token when parse_tool_calls is true + assert_equals("", msg.content); // Content is consumed but not added when parse_tool_calls is true + assert_equals("Need location info", msg.reasoning_content); + } + + // Test with NONE reasoning format (the actual failing case that was crashing) + { + auto msg = common_chat_parse( + "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, // Test with NONE - should still work + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + } + ); + // With NONE, reasoning won't be parsed but <|end|> should still be handled + assert_equals("<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42", msg.content); + assert_equals("", msg.reasoning_content); + } + // Test without <|end|> token (backward compatibility) { auto msg = common_chat_parse( diff --git 
a/tools/server/server.cpp b/tools/server/server.cpp index a255d481a4d1c..e0438f75ef750 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -383,8 +383,17 @@ struct server_task { } else { params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format; } - params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format; - params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params_base.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); + + // Use template's reasoning format if provided, otherwise use CLI default + auto reasoning_it = data.find("reasoning_format"); + if (reasoning_it != data.end()) { + params.oaicompat_chat_syntax.reasoning_format = static_cast<common_reasoning_format>(reasoning_it->get<int>()); + SRV_INF("Reasoning format (from template): %s\n", common_reasoning_format_name(params.oaicompat_chat_syntax.reasoning_format)); + } else { + params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format; + } + + params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false); params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false); } diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index f3dfc8225da4d..ad039b2a85358 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -804,6 +804,7 @@ static json oaicompat_chat_params_parse( } llama_params["chat_format"] = static_cast<int>(chat_params.format); + llama_params["reasoning_format"] = static_cast<int>(chat_params.reasoning_format); // Pass template's reasoning format llama_params["prompt"] = chat_params.prompt; if (!chat_params.grammar.empty()) { llama_params["grammar"] = chat_params.grammar; From 8c45ee747fec4741da4a479b62e5ea4084c7bdac Mon Sep 17 00:00:00 2001 From: Trevon Date: Sat, 9 Aug 2025 14:01:22 -0400
Subject: [PATCH 3/4] fix GPT-OSS end token content consumption logic --- common/chat.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 7996c14df6513..e5d38efef23a3 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1328,12 +1328,16 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) { if (end_pos != std::string::npos) { // Content is everything from current position to <|end|> - auto content = builder.input().substr(builder.pos(), end_pos - builder.pos()); if (!builder.syntax().parse_tool_calls) { + auto content = builder.input().substr(builder.pos(), end_pos - builder.pos()); builder.add_content(content); + builder.move_to(end_pos); + } else { + // When parse_tool_calls=true, we still need to consume the content + // but don't add it to the result + builder.move_to(end_pos); } - // Move to the <|end|> token and consume it - builder.move_to(end_pos); + // Consume the <|end|> token builder.consume_literal("<|end|>"); } else { // No <|end|> token, consume everything remaining From 05002b566b7aafcb7f40ce264570fd750eb1df63 Mon Sep 17 00:00:00 2001 From: Trevon Date: Sat, 9 Aug 2025 15:26:33 -0400 Subject: [PATCH 4/4] simplify GPT-OSS parser to always return content --- common/chat.cpp | 26 +++++++++----------------- tests/test-chat-parser.cpp | 4 ++-- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index e5d38efef23a3..73826c2c12013 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1321,6 +1321,8 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp } static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) { // TODO @ngxson : this won't work with --special enabled, we should fix that + // TODO: Full Harmony format support with multiple channels (analysis, commentary, final) + // is being implemented in PR #15181. This is a minimal fix to prevent crashes. 
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>"); // Check if we have an <|end|> token at the current position or later @@ -1328,26 +1330,16 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) { if (end_pos != std::string::npos) { // Content is everything from current position to <|end|> - if (!builder.syntax().parse_tool_calls) { - auto content = builder.input().substr(builder.pos(), end_pos - builder.pos()); - builder.add_content(content); - builder.move_to(end_pos); - } else { - // When parse_tool_calls=true, we still need to consume the content - // but don't add it to the result - builder.move_to(end_pos); - } - // Consume the <|end|> token + auto content = builder.input().substr(builder.pos(), end_pos - builder.pos()); + builder.add_content(content); + builder.move_to(end_pos); + // Consume the <|end|> token to prevent "Unexpected content at end of input" errors + // Note: <|end|> marks end of message, not end of generation. Full multi-message + // support requires proper Harmony channel parsing. 
builder.consume_literal("<|end|>"); } else { // No <|end|> token, consume everything remaining - if (!builder.syntax().parse_tool_calls) { - builder.add_content(builder.consume_rest()); - } else { - // Even when parse_tool_calls=true, we need to consume the remaining content - // to avoid the "Unexpected content at end of input" error - builder.consume_rest(); - } + builder.add_content(builder.consume_rest()); } } diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp index 76090b538baf8..56b953459038f 100644 --- a/tests/test-chat-parser.cpp +++ b/tests/test-chat-parser.cpp @@ -375,7 +375,7 @@ static void test_gpt_oss_harmony_format() { } ); // Should not throw with <|end|> token even when parse_tool_calls is true - assert_equals("", msg.content); // Content is empty when parse_tool_calls is true - content is consumed but not added + assert_equals("The answer is 42", msg.content); // Content is always added regardless of parse_tool_calls setting assert_equals("Let me think about this...", msg.reasoning_content); } @@ -393,7 +393,7 @@ static void test_gpt_oss_harmony_format() { } ); // Should not throw even without <|end|> token when parse_tool_calls is true - assert_equals("", msg.content); // Content is consumed but not added when parse_tool_calls is true + assert_equals("Could you specify the location?", msg.content); // Content is always added regardless of parse_tool_calls setting assert_equals("Need location info", msg.reasoning_content); }