From fa154b2258eb90ac66a743c6cd5f989e68017406 Mon Sep 17 00:00:00 2001 From: Alde Rojas Date: Tue, 14 Apr 2026 03:22:17 -0500 Subject: [PATCH 1/2] common : add common_chat_split_by_role --- common/chat.cpp | 46 ++++++++++++++++++++++++++++++++++++++++++++- common/chat.h | 14 ++++++++++++++ tests/test-chat.cpp | 32 +++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/common/chat.cpp b/common/chat.cpp index e27b6c3413c9..42ec8f6b551c 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -70,6 +70,42 @@ static bool has_content_or_tool_calls(const common_chat_msg & msg) { return !msg.content.empty() || !msg.tool_calls.empty(); } +std::vector common_chat_split_by_role(const std::string & prompt, const std::vector & delims) { + if (delims.empty() || prompt.empty()) { + return {}; + } + + auto parser = build_peg_parser([&](common_peg_parser_builder & p) { + std::vector all_delims; + std::vector tagged_delims; + + all_delims.reserve(delims.size()); + tagged_delims.reserve(delims.size()); + for (const auto & d : delims) { + all_delims.push_back(d.delimiter); + tagged_delims.push_back(p.tag(d.role, p.literal(d.delimiter))); + } + + auto any_delim = p.until_one_of(all_delims); + return any_delim + p.zero_or_more(p.choice(tagged_delims) + any_delim) + p.end(); + }); + + common_peg_parse_context ctx(prompt); + const auto result = parser.parse(ctx); + if (!result.success()) { + return {}; + } + + std::vector spans; + ctx.ast.visit(result, [&](const common_peg_ast_node & node) { + if (!node.tag.empty()) { + spans.push_back({ node.tag, node.start, node.end - node.start }); + } + }); + + return spans; +} + json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const { if (!content.empty() && !content_parts.empty()) { throw std::runtime_error("Cannot specify both content and content_parts"); @@ -973,7 +1009,15 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp } } - data.prompt = prompt; + data.prompt = prompt; + data.message_spans = common_chat_split_by_role(prompt, { + { "assistant", "<|start|>assistant" }, + { "user", "<|start|>user" }, + { "system", "<|start|>developer" }, + { "system", "<|start|>system" }, + { "tool", "<|start|>functions" }, + }); + data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; data.supports_thinking = true; diff --git a/common/chat.h b/common/chat.h index b06ca37fd742..89697fcc4649 100644 --- a/common/chat.h +++ b/common/chat.h @@ -132,6 +132,17 @@ struct common_chat_msg_diff { } }; +struct common_chat_msg_span { + std::string role; + std::size_t pos = 0; + std::size_t len = 0; +}; + +struct common_chat_msg_delimiter { + std::string role; + std::string delimiter; +}; + struct common_chat_tool { std::string name; std::string description; @@ -187,6 +198,7 @@ struct common_chat_params { std::vector preserved_tokens; std::vector additional_stops; std::string parser; + std::vector message_spans; }; // per-message parsing syntax @@ -275,3 +287,5 @@ std::optional common_chat_try_specialized_template( const common_chat_template & tmpl, const std::string & src, autoparser::generation_params & params); + +std::vector common_chat_split_by_role(const std::string & prompt, const std::vector & delims); diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 8438a5eaff04..4f4f7fa96b33 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1473,6 +1473,37 @@ static void test_msgs_oaicompat_json_conversion() { } } +static void test_split_by_role() { + LOG_DBG("%s\n", __func__); + + // Empty inputs + assert_equals(0, common_chat_split_by_role("", {}).size()); + assert_equals(0, common_chat_split_by_role("hello", {}).size()); + assert_equals(0, common_chat_split_by_role("", { { "user", "<|user|>" } }).size()); + + // Multi-role conversation, no leading/trailing content + { + const std::string prompt = "<|user|>Hi<|assistant|>Hello<|user|>Bye"; + const auto splits = common_chat_split_by_role(prompt, { + { "user", "<|user|>" }, + { "assistant", "<|assistant|>" }, + }); + assert_equals(3, splits.size()); + + assert_equals("user", splits[0].role); + assert_equals(0, splits[0].pos); + assert_equals(8, splits[0].len); + + assert_equals("assistant", splits[1].role); + assert_equals(10, splits[1].pos); + assert_equals(13, splits[1].len); + + assert_equals("user", splits[2].role); + assert_equals(28, splits[2].pos); + assert_equals(8, splits[2].len); + } +} + static void test_tools_oaicompat_json_conversion() { LOG_DBG("%s\n", __func__); std::vector tools{ @@ -4168,6 +4199,7 @@ int main(int argc, char ** argv) { { test_msg_diffs_compute(); test_msgs_oaicompat_json_conversion(); + test_split_by_role(); test_tools_oaicompat_json_conversion(); test_developer_role_to_system_workaround(); test_template_output_peg_parsers(detailed_debug); From 1a533340c5308a07f1ec16c29bfb60a5c1132aa6 Mon Sep 17 00:00:00 2001 From: Alde Rojas Date: Tue, 14 Apr 2026 03:46:25 -0500 Subject: [PATCH 2/2] cont : fix spans to reach end of message --- common/chat.cpp | 11 +++++++---- tests/test-chat.cpp | 9 ++++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 42ec8f6b551c..24890e9d948e 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -77,17 +77,20 @@ std::vector common_chat_split_by_role(const std::string & auto parser = build_peg_parser([&](common_peg_parser_builder & p) { std::vector all_delims; - std::vector tagged_delims; + std::vector tagged_messages; all_delims.reserve(delims.size()); - tagged_delims.reserve(delims.size()); + tagged_messages.reserve(delims.size()); for (const auto & d : delims) { all_delims.push_back(d.delimiter); - tagged_delims.push_back(p.tag(d.role, p.literal(d.delimiter))); } auto any_delim = p.until_one_of(all_delims); - return any_delim + p.zero_or_more(p.choice(tagged_delims) + any_delim) + p.end(); + for (const auto & d : delims) { + tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim)); + } + + return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end(); }); common_peg_parse_context ctx(prompt); diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 4f4f7fa96b33..0a48b8e21b24 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1492,15 +1492,18 @@ static void test_split_by_role() { assert_equals("user", splits[0].role); assert_equals(0, splits[0].pos); - assert_equals(8, splits[0].len); + assert_equals(10, splits[0].len); + assert_equals("<|user|>Hi", prompt.substr(splits[0].pos, splits[0].len)); assert_equals("assistant", splits[1].role); assert_equals(10, splits[1].pos); - assert_equals(13, splits[1].len); + assert_equals(18, splits[1].len); + assert_equals("<|assistant|>Hello", prompt.substr(splits[1].pos, splits[1].len)); assert_equals("user", splits[2].role); assert_equals(28, splits[2].pos); - assert_equals(8, splits[2].len); + assert_equals(11, splits[2].len); + assert_equals("<|user|>Bye", prompt.substr(splits[2].pos, splits[2].len)); } }