From 92d0d81a8053a7ba3968a46d27c2e4a66aef7853 Mon Sep 17 00:00:00 2001 From: Trevon Date: Sat, 9 Aug 2025 12:45:07 -0400 Subject: [PATCH 1/4] chat-parser : fix GPT-OSS Harmony format end token handling The GPT-OSS model uses OpenAI's Harmony format which includes an <|end|> token after the final message. The parser wasn't handling this token properly, causing finish() to throw 'Unexpected content at end of input'. Modified the GPT-OSS parser to strip the <|end|> token from the content if present. Added comprehensive tests for GPT-OSS format with and without the end token to ensure backward compatibility. --- common/chat.cpp | 10 ++++++- tests/test-chat-parser.cpp | 58 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/common/chat.cpp b/common/chat.cpp index 316bd24170c9e..89e737babaabf 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1322,7 +1322,15 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) { // TODO @ngxson : this won't work with --special enabled, we should fix that builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>"); if (!builder.syntax().parse_tool_calls) { - builder.add_content(builder.consume_rest()); + // First consume everything except potential <|end|> token + auto rest = builder.consume_rest(); + // Check if the rest ends with <|end|> and remove it if present + const std::string end_token = "<|end|>"; + if (rest.size() >= end_token.size() && + rest.substr(rest.size() - end_token.size()) == end_token) { + rest = rest.substr(0, rest.size() - end_token.size()); + } + builder.add_content(rest); return; } } diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp index 59e44e07d25ed..a83fb9339bbfa 100644 --- a/tests/test-chat-parser.cpp +++ b/tests/test-chat-parser.cpp @@ -10,6 +10,7 @@ #include #include "chat-parser.h" +#include "chat.h" #include "common.h" #include "log.h" #include "regex-partial.h" 
@@ -341,12 +342,69 @@ static void test_positions() { } } +static void test_gpt_oss_harmony_format() { + // Test GPT-OSS Harmony format with <|end|> token + { + auto msg = common_chat_parse( + "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, // Use AUTO to enable reasoning parsing + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + } + ); + // Should not throw with <|end|> token + assert_equals("The answer is 42", msg.content); + assert_equals("Let me think about this...", msg.reasoning_content); + } + + // Test without <|end|> token (backward compatibility) + { + auto msg = common_chat_parse( + "<|channel|>analysis<|message|>Thinking...<|start|>assistant<|channel|>final<|message|>Hello world", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + } + ); + // Should not throw without <|end|> token + assert_equals("Hello world", msg.content); + assert_equals("Thinking...", msg.reasoning_content); + } + + // Test partial message with <|end|> token + { + auto msg = common_chat_parse( + "<|channel|>analysis<|message|>Processing...<|start|>assistant<|channel|>final<|message|>Partial result<|e", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + } + ); + // Should not throw for partial message + assert_equals("Partial result<|e", msg.content); + assert_equals("Processing...", msg.reasoning_content); + } +} + int 
main() { test_positions(); test_json_with_dumped_args_no_args(); test_json_with_dumped_args(); test_reasoning(); test_regex(); + test_gpt_oss_harmony_format(); std::cout << "All tests passed!\n"; return 0; } From 730c0d9167ef3f577a51aa538a408c988e601302 Mon Sep 17 00:00:00 2001 From: Trevon Date: Sat, 9 Aug 2025 13:49:26 -0400 Subject: [PATCH 2/4] Fix server crash when using GPT-OSS models with tools that was caused by unconsumed <|end|> tokens in the chat parser. --- common/chat.cpp | 32 ++++++++++++++------- common/chat.h | 1 + tests/test-chat-parser.cpp | 58 ++++++++++++++++++++++++++++++++++++-- tools/server/server.cpp | 13 +++++++-- tools/server/utils.hpp | 1 + 5 files changed, 91 insertions(+), 14 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 89e737babaabf..7996c14df6513 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1313,6 +1313,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp data.prompt = prompt; data.format = COMMON_CHAT_FORMAT_GPT_OSS; + data.reasoning_format = COMMON_REASONING_FORMAT_AUTO; // GPT-OSS Harmony format always has reasoning channels // TODO: support tool calls in GPT-OSS? 
@@ -1321,17 +1322,28 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) { // TODO @ngxson : this won't work with --special enabled, we should fix that builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>"); - if (!builder.syntax().parse_tool_calls) { - // First consume everything except potential <|end|> token - auto rest = builder.consume_rest(); - // Check if the rest ends with <|end|> and remove it if present - const std::string end_token = "<|end|>"; - if (rest.size() >= end_token.size() && - rest.substr(rest.size() - end_token.size()) == end_token) { - rest = rest.substr(0, rest.size() - end_token.size()); + + // Check if we have an <|end|> token at the current position or later + auto end_pos = builder.input().find("<|end|>", builder.pos()); + + if (end_pos != std::string::npos) { + // Content is everything from current position to <|end|> + auto content = builder.input().substr(builder.pos(), end_pos - builder.pos()); + if (!builder.syntax().parse_tool_calls) { + builder.add_content(content); + } + // Move to the <|end|> token and consume it + builder.move_to(end_pos); + builder.consume_literal("<|end|>"); + } else { + // No <|end|> token, consume everything remaining + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + } else { + // Even when parse_tool_calls=true, we need to consume the remaining content + // to avoid the "Unexpected content at end of input" error + builder.consume_rest(); } - builder.add_content(rest); - return; } } diff --git a/common/chat.h b/common/chat.h index eb628d8bc275d..ba28f5ba323f9 100644 --- a/common/chat.h +++ b/common/chat.h @@ -135,6 +135,7 @@ struct common_chat_templates_inputs { struct common_chat_params { common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + common_reasoning_format reasoning_format = 
COMMON_REASONING_FORMAT_NONE; // Template-specific reasoning format std::string prompt; std::string grammar; bool grammar_lazy = false; diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp index a83fb9339bbfa..76090b538baf8 100644 --- a/tests/test-chat-parser.cpp +++ b/tests/test-chat-parser.cpp @@ -343,14 +343,14 @@ static void test_positions() { } static void test_gpt_oss_harmony_format() { - // Test GPT-OSS Harmony format with <|end|> token + // Test GPT-OSS Harmony format with <|end|> token and AUTO reasoning (parse_tool_calls=false) { auto msg = common_chat_parse( "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>", /* is_partial= */ false, { /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, // Use AUTO to enable reasoning parsing + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, /* .reasoning_in_content = */ false, /* .thinking_forced_open = */ false, /* .parse_tool_calls = */ false, @@ -361,6 +361,60 @@ static void test_gpt_oss_harmony_format() { assert_equals("Let me think about this...", msg.reasoning_content); } + // Test with parse_tool_calls=true (the default case that was failing) + { + auto msg = common_chat_parse( + "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ true, // This was the failing case! 
+ } + ); + // Should not throw with <|end|> token even when parse_tool_calls is true + assert_equals("", msg.content); // Content is empty when parse_tool_calls is true - content is consumed but not added + assert_equals("Let me think about this...", msg.reasoning_content); + } + + // Test with parse_tool_calls=true and NO <|end|> token (real server scenario) + { + auto msg = common_chat_parse( + "<|channel|>analysis<|message|>Need location info<|start|>assistant<|channel|>final<|message|>Could you specify the location?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ true, // Server default with tools + } + ); + // Should not throw even without <|end|> token when parse_tool_calls is true + assert_equals("", msg.content); // Content is consumed but not added when parse_tool_calls is true + assert_equals("Need location info", msg.reasoning_content); + } + + // Test with NONE reasoning format (the actual failing case that was crashing) + { + auto msg = common_chat_parse( + "<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42<|end|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, // Test with NONE - should still work + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + } + ); + // With NONE, reasoning won't be parsed but <|end|> should still be handled + assert_equals("<|channel|>analysis<|message|>Let me think about this...<|start|>assistant<|channel|>final<|message|>The answer is 42", msg.content); + assert_equals("", msg.reasoning_content); + } + // Test without <|end|> token (backward compatibility) { auto msg = common_chat_parse( diff --git 
a/tools/server/server.cpp b/tools/server/server.cpp index a255d481a4d1c..e0438f75ef750 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -383,8 +383,17 @@ struct server_task { } else { params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format; } - params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format; - params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params_base.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); + + // Use template's reasoning format if provided, otherwise use CLI default + auto reasoning_it = data.find("reasoning_format"); + if (reasoning_it != data.end()) { + params.oaicompat_chat_syntax.reasoning_format = static_cast<common_reasoning_format>(reasoning_it->get<int>()); + SRV_INF("Reasoning format (from template): %s\n", common_reasoning_format_name(params.oaicompat_chat_syntax.reasoning_format)); + } else { + params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format; + } + + params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false); params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false); } diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index f3dfc8225da4d..ad039b2a85358 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -804,6 +804,7 @@ static json oaicompat_chat_params_parse( } llama_params["chat_format"] = static_cast<int>(chat_params.format); + llama_params["reasoning_format"] = static_cast<int>(chat_params.reasoning_format); // Pass template's reasoning format llama_params["prompt"] = chat_params.prompt; if (!chat_params.grammar.empty()) { llama_params["grammar"] = chat_params.grammar; From 8c45ee747fec4741da4a479b62e5ea4084c7bdac Mon Sep 17 00:00:00 2001 From: Trevon Date: Sat, 9 Aug 2025 14:01:22 -0400
Subject: [PATCH 3/4] fix GPT-OSS end token content consumption logic --- common/chat.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 7996c14df6513..e5d38efef23a3 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1328,12 +1328,16 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) { if (end_pos != std::string::npos) { // Content is everything from current position to <|end|> - auto content = builder.input().substr(builder.pos(), end_pos - builder.pos()); if (!builder.syntax().parse_tool_calls) { + auto content = builder.input().substr(builder.pos(), end_pos - builder.pos()); builder.add_content(content); + builder.move_to(end_pos); + } else { + // When parse_tool_calls=true, we still need to consume the content + // but don't add it to the result + builder.move_to(end_pos); } - // Move to the <|end|> token and consume it - builder.move_to(end_pos); + // Consume the <|end|> token builder.consume_literal("<|end|>"); } else { // No <|end|> token, consume everything remaining From 05002b566b7aafcb7f40ce264570fd750eb1df63 Mon Sep 17 00:00:00 2001 From: Trevon Date: Sat, 9 Aug 2025 15:26:33 -0400 Subject: [PATCH 4/4] simplify GPT-OSS parser to always return content --- common/chat.cpp | 26 +++++++++----------------- tests/test-chat-parser.cpp | 4 ++-- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index e5d38efef23a3..73826c2c12013 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1321,6 +1321,8 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp } static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) { // TODO @ngxson : this won't work with --special enabled, we should fix that + // TODO: Full Harmony format support with multiple channels (analysis, commentary, final) + // is being implemented in PR #15181. This is a minimal fix to prevent crashes. 
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>"); // Check if we have an <|end|> token at the current position or later @@ -1328,26 +1330,16 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) { if (end_pos != std::string::npos) { // Content is everything from current position to <|end|> - if (!builder.syntax().parse_tool_calls) { - auto content = builder.input().substr(builder.pos(), end_pos - builder.pos()); - builder.add_content(content); - builder.move_to(end_pos); - } else { - // When parse_tool_calls=true, we still need to consume the content - // but don't add it to the result - builder.move_to(end_pos); - } - // Consume the <|end|> token + auto content = builder.input().substr(builder.pos(), end_pos - builder.pos()); + builder.add_content(content); + builder.move_to(end_pos); + // Consume the <|end|> token to prevent "Unexpected content at end of input" errors + // Note: <|end|> marks end of message, not end of generation. Full multi-message + // support requires proper Harmony channel parsing. 
builder.consume_literal("<|end|>"); } else { // No <|end|> token, consume everything remaining - if (!builder.syntax().parse_tool_calls) { - builder.add_content(builder.consume_rest()); - } else { - // Even when parse_tool_calls=true, we need to consume the remaining content - // to avoid the "Unexpected content at end of input" error - builder.consume_rest(); - } + builder.add_content(builder.consume_rest()); } } diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp index 76090b538baf8..56b953459038f 100644 --- a/tests/test-chat-parser.cpp +++ b/tests/test-chat-parser.cpp @@ -375,7 +375,7 @@ static void test_gpt_oss_harmony_format() { } ); // Should not throw with <|end|> token even when parse_tool_calls is true - assert_equals("", msg.content); // Content is empty when parse_tool_calls is true - content is consumed but not added + assert_equals("The answer is 42", msg.content); // Content is always added regardless of parse_tool_calls setting assert_equals("Let me think about this...", msg.reasoning_content); } @@ -393,7 +393,7 @@ static void test_gpt_oss_harmony_format() { } ); // Should not throw even without <|end|> token when parse_tool_calls is true - assert_equals("", msg.content); // Content is consumed but not added when parse_tool_calls is true + assert_equals("Could you specify the location?", msg.content); // Content is always added regardless of parse_tool_calls setting assert_equals("Need location info", msg.reasoning_content); }