diff --git a/README.md b/README.md index dcfbfb0..cdc8d20 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,9 @@ FetchContent_MakeAvailable(minja) target_link_libraries( PRIVATE minja) ``` -See API in [minja/minja.hpp](./include/minja/minja.hpp) and [minja/chat-template.hpp](./include/minja/chat-template.hpp) (experimental). +See API in [minja/minja.hpp](./include/minja/minja.hpp) and [minja/chat-template.hpp](./include/minja/chat-template.hpp). + +For chat template capabilities detection and automatic polyfills (reasoning formats, tool calls, etc.), see [docs/CAPABILITIES_AND_POLYFILLS.md](./docs/CAPABILITIES_AND_POLYFILLS.md). For raw Jinja templating (see [examples/raw.cpp](./examples/raw.cpp)): @@ -124,7 +126,7 @@ Main limitations (non-exhaustive list): ## Roadmap / TODOs - [ ] Fix known line difference issues on Windows -- [ ] Document the various capabilities detectors + backfill strategies used +- [x] Document the various capabilities detectors + backfill strategies used (see [docs/CAPABILITIES_AND_POLYFILLS.md](./docs/CAPABILITIES_AND_POLYFILLS.md)) - [ ] Propose integration w/ https://github.com/google/gemma.cpp - [x] Integrate to llama.cpp: https://github.com/ggerganov/llama.cpp/pull/11016 + https://github.com/ggerganov/llama.cpp/pull/9639 - Improve fuzzing coverage: diff --git a/docs/CAPABILITIES_AND_POLYFILLS.md b/docs/CAPABILITIES_AND_POLYFILLS.md new file mode 100644 index 0000000..67f1877 --- /dev/null +++ b/docs/CAPABILITIES_AND_POLYFILLS.md @@ -0,0 +1,327 @@ +# Chat Template Capabilities and Polyfills + +Minja automatically detects chat template capabilities and can apply polyfills to normalize differences between templates. This enables applications to use a single canonical message format while supporting a wide variety of model templates. 
+ +## Table of Contents + +- [Capabilities Detection](#capabilities-detection) +- [Reasoning Formats](#reasoning-formats) +- [Automatic Polyfills](#automatic-polyfills) +- [Usage](#usage) +- [Examples](#examples) + +## Capabilities Detection + +When a `chat_template` is constructed, minja probes the template with test messages to detect its capabilities: + +### Core Capabilities + +| Capability | Description | +|------------|-------------| +| `supports_system_role` | Template renders system messages correctly | +| `supports_tools` | Template has native tool/function definition support | +| `supports_tool_calls` | Template renders assistant tool calls | +| `supports_tool_responses` | Template handles tool response messages | +| `supports_tool_call_id` | Template uses tool call IDs for correlation | +| `supports_parallel_tool_calls` | Template can handle multiple tool calls per message | + +### Content Format Capabilities + +| Capability | Description | +|------------|-------------| +| `requires_object_arguments` | Tool call arguments must be objects, not JSON strings | +| `requires_non_null_content` | Content field cannot be null (must be empty string) | +| `requires_typed_content_blocks` | Content must be `[{type: "text", text: ...}]` format | + +### Reasoning Capabilities + +| Capability | Description | +|------------|-------------| +| `supports_reasoning` | Template supports some form of reasoning/thinking | +| `reasoning_format` | Which `ReasoningFormat` the template uses | +| `reasoning_requires_tools` | Reasoning only works with tool_calls present | +| `supports_clear_thinking` | Template respects `clear_thinking` flag for visibility control | +| `supports_reasoning_without_content` | Can emit reasoning with empty content | +| `supports_reasoning_with_content` | Can emit both reasoning and content together | +| `supports_enable_thinking` | Template honors `enable_thinking=false` | + +## Reasoning Formats + +Different models represent chain-of-thought reasoning 
in different ways. Minja detects and supports six formats: + +### Field-Based Formats + +| Format | Field | Example Models | +|--------|-------|----------------| +| `REASONING_CONTENT_FIELD` | `message.reasoning_content` | Qwen3, GLM-4.6/4.7 | +| `THOUGHT_FIELD` | `message.thought` | MiniCPM3 | +| `THINKING_FIELD` | `message.thinking` | GPT-OSS-120B | +| `TOOL_PLAN_FIELD` | `message.tool_plan` | Command-R7B (requires tool_calls) | + +### Content Block Formats + +| Format | Block Type | Example Models | +|--------|------------|----------------| +| `THINKING_CONTENT_BLOCK` | `{type: "thinking", thinking: ...}` | Ministral, DeepSeek-R1 | +| `THOUGHTS_CONTENT_BLOCK` | `{type: "thoughts", text: ...}` | Apertus, Kimi K2 | + +### Detection Priority + +Formats are detected in priority order: +1. `REASONING_CONTENT_FIELD` (canonical format) +2. `THOUGHT_FIELD` +3. `THINKING_FIELD` +4. `TOOL_PLAN_FIELD` +5. `THINKING_CONTENT_BLOCK` +6. `THOUGHTS_CONTENT_BLOCK` + +## Automatic Polyfills + +When `apply_polyfills` is enabled (default), minja automatically transforms messages to match what each template expects. + +### Reasoning Polyfill + +The canonical input format uses `reasoning_content`: + +```json +{ + "role": "assistant", + "reasoning_content": "Let me think about this...", + "content": "The answer is 42." +} +``` + +This is automatically transformed based on the template's `reasoning_format`: + +**For THOUGHT_FIELD (MiniCPM3):** +```json +{ + "role": "assistant", + "thought": "Let me think about this...", + "content": "The answer is 42." 
+} +``` + +**For THINKING_CONTENT_BLOCK (Ministral):** +```json +{ + "role": "assistant", + "content": [ + {"type": "thinking", "thinking": "Let me think about this..."}, + {"type": "text", "text": "The answer is 42."} + ] +} +``` + +**For THOUGHTS_CONTENT_BLOCK (Kimi K2):** +```json +{ + "role": "assistant", + "content": [ + {"type": "thoughts", "text": "Let me think about this..."}, + {"type": "text", "text": "The answer is 42."} + ] +} +``` + +### Other Polyfills + +| Polyfill | What it does | +|----------|--------------| +| `polyfill_system_role` | Merges system messages into first user message | +| `polyfill_tools` | Adds tool definitions to system prompt | +| `polyfill_tool_calls` | Converts tool calls to text format | +| `polyfill_tool_responses` | Converts tool role to user with tool_response object | +| `polyfill_object_arguments` | Parses JSON string arguments to objects | +| `polyfill_typed_content` | Converts string content to `[{type: "text", text: ...}]` | + +### Polyfill Options + +All polyfills can be individually controlled via `chat_template_options`: + +```cpp +chat_template_options opts; +opts.apply_polyfills = true; // Master switch (default: true) +opts.polyfill_reasoning = true; // Reasoning format conversion +opts.polyfill_typed_content = true; // String → content blocks +opts.polyfill_system_role = true; // System role emulation +opts.polyfill_tools = true; // Tool definition injection +opts.polyfill_tool_calls = true; // Tool call formatting +opts.polyfill_tool_responses = true; // Tool response formatting +opts.polyfill_object_arguments = true; // Argument parsing +``` + +## Usage + +### C++ API + +```cpp +#include <minja/chat-template.hpp> + +// Load template from model's tokenizer_config.json +std::string template_source = /* chat_template field */; +std::string bos_token = "<s>"; +std::string eos_token = "</s>"; + +minja::chat_template tmpl(template_source, bos_token, eos_token); + +// Check capabilities +auto caps = tmpl.original_caps(); +if (caps.supports_reasoning) 
{ + std::cout << "Template supports reasoning format: " + << static_cast<int>(caps.reasoning_format) << std::endl; +} + +// Prepare messages with canonical format +nlohmann::json messages = { + {{"role", "user"}, {"content", "What is 2+2?"}}, + {{"role", "assistant"}, + {"reasoning_content", "Let me calculate: 2+2=4"}, + {"content", "The answer is 4."}} +}; + +// Apply template (polyfills applied automatically) +minja::chat_template_inputs inputs; +inputs.messages = messages; +inputs.add_generation_prompt = true; + +std::string prompt = tmpl.apply(inputs); +``` + +### Checking Specific Capabilities + +```cpp +auto caps = tmpl.original_caps(); + +// Tool support +if (caps.supports_tool_calls && !caps.requires_object_arguments) { + // Can use stringified JSON arguments +} + +// Reasoning support +if (caps.supports_reasoning) { + switch (caps.reasoning_format) { + case minja::ReasoningFormat::REASONING_CONTENT_FIELD: + // Native support, no polyfill needed + break; + case minja::ReasoningFormat::THINKING_CONTENT_BLOCK: + // Will be polyfilled to content blocks + break; + // ... 
handle other formats + } +} + +// Content format +if (caps.requires_typed_content_blocks) { + // Template expects [{type: "text", text: ...}] format + // Polyfill will convert automatically if enabled +} +``` + +### Disabling Polyfills + +```cpp +minja::chat_template_options opts; +opts.apply_polyfills = false; // Disable all polyfills + +// Or disable specific polyfills +opts.polyfill_reasoning = false; +opts.polyfill_typed_content = false; + +std::string prompt = tmpl.apply(inputs, opts); +``` + +## Examples + +### Example 1: Qwen3 (Native reasoning_content) + +```cpp +// Qwen3 template natively supports reasoning_content +auto caps = tmpl.original_caps(); +// caps.reasoning_format == ReasoningFormat::REASONING_CONTENT_FIELD +// caps.supports_reasoning == true + +// Input (canonical format): +json msg = { + {"role", "assistant"}, + {"reasoning_content", "Thinking..."}, + {"content", "Answer"} +}; +// Output: No transformation needed, template handles reasoning_content directly +``` + +### Example 2: MiniCPM3 (thought field) + +```cpp +// MiniCPM3 uses "thought" field +auto caps = tmpl.original_caps(); +// caps.reasoning_format == ReasoningFormat::THOUGHT_FIELD + +// Input (canonical format): +json msg = { + {"role", "assistant"}, + {"reasoning_content", "Thinking..."}, + {"content", "Answer"} +}; +// After polyfill: +// {"role": "assistant", "thought": "Thinking...", "content": "Answer"} +``` + +### Example 3: Kimi K2 (thoughts content block) + +```cpp +// Kimi K2 uses content blocks with type="thoughts" +auto caps = tmpl.original_caps(); +// caps.reasoning_format == ReasoningFormat::THOUGHTS_CONTENT_BLOCK +// caps.requires_typed_content_blocks == true + +// Input (canonical format): +json msg = { + {"role", "assistant"}, + {"reasoning_content", "Thinking..."}, + {"content", "Answer"} +}; +// After polyfill: +// { +// "role": "assistant", +// "content": [ +// {"type": "thoughts", "text": "Thinking..."}, +// {"type": "text", "text": "Answer"} +// ] +// } +``` + 
+### Example 4: Command-R7B (tool_plan with tool calls) + +```cpp +// Command-R7B uses tool_plan field, but only with tool_calls +auto caps = tmpl.original_caps(); +// caps.reasoning_format == ReasoningFormat::TOOL_PLAN_FIELD +// caps.reasoning_requires_tools == true + +// Input with tool_calls: +json msg = { + {"role", "assistant"}, + {"reasoning_content", "I need to search for this..."}, + {"content", nullptr}, + {"tool_calls", json::array({...})} +}; +// After polyfill: +// { +// "role": "assistant", +// "tool_plan": "I need to search for this...", +// "content": null, +// "tool_calls": [...] +// } +``` + +## Integration with llama.cpp + +When using minja in llama.cpp, the polyfill system enables: + +1. **Unified Input Format**: Always use `reasoning_content` regardless of model +2. **Automatic Conversion**: Polyfills transform to each model's native format +3. **Simplified Parsing**: Output parsers can focus on canonical format +4. **Capability Queries**: Check what features a model's template supports + +See the [llama.cpp integration branch](https://github.com/ochafik/llama.cpp/tree/sync-minja-reasoning) for implementation details. 
diff --git a/include/minja/chat-template.hpp b/include/minja/chat-template.hpp index e7bf82b..2a1e988 100644 --- a/include/minja/chat-template.hpp +++ b/include/minja/chat-template.hpp @@ -28,6 +28,17 @@ using json = nlohmann::ordered_json; namespace minja { +// Format used by a template to represent reasoning/thinking content +enum class ReasoningFormat { + NONE, // Template doesn't support reasoning + REASONING_CONTENT_FIELD, // message.reasoning_content field (Qwen3, GLM-4.6/4.7) - canonical format + THINKING_CONTENT_BLOCK, // message.content[].type == "thinking" (Ministral, DeepSeek-R1) + THOUGHTS_CONTENT_BLOCK, // message.content[].type == "thoughts" (Apertus) + THOUGHT_FIELD, // message.thought field (MiniCPM3) + TOOL_PLAN_FIELD, // message.tool_plan field (Command-R7B) + THINKING_FIELD, // message.thinking field (GPT-OSS-120B) +}; + struct chat_template_caps { bool supports_tools = false; bool supports_tool_calls = false; @@ -40,8 +51,23 @@ struct chat_template_caps { bool requires_object_arguments = false; // CohereForAI/c4ai-command-r-plus simple variant bool requires_non_null_content = false; - // MiniMaxAI/MiniMax-Text-01 special - bool requires_typed_content = false; + // Template expects content as typed blocks: [{type: "text", text: ...}] instead of plain string + bool requires_typed_content_blocks = false; + + // Reasoning capabilities (extended thinking / chain-of-thought) + bool supports_reasoning = false; // Template supports some form of reasoning + ReasoningFormat reasoning_format = ReasoningFormat::NONE; + bool reasoning_requires_tools = false; // Reasoning only works when tool_calls present (Command-R7B) + bool reasoning_requires_suffix_position = false; // Reasoning hidden for last non-tool-call assistant (Kimi K2) + + // Reasoning behavior flags (computed via detection probes) + bool supports_reasoning_without_content = false; // Can emit reasoning with empty/null content + bool supports_reasoning_with_content = false; // Can emit both 
reasoning and content together + bool supports_enable_thinking = false; // Template responds to enable_thinking=false + + // Whether template supports reasoning visibility control (GLM-4.7's clear_thinking flag) + // When clear_thinking=false, all reasoning is shown; when true/default, position-based visibility + bool supports_clear_thinking = false; }; struct chat_template_inputs { @@ -65,6 +91,8 @@ struct chat_template_options { bool polyfill_system_role = true; bool polyfill_object_arguments = true; bool polyfill_typed_content = true; + // Convert reasoning_content to template's native format (thought, thinking, tool_plan) + bool polyfill_reasoning = true; }; class chat_template { @@ -124,16 +152,17 @@ class chat_template { const json dummy_str_user_msg = {{"role", "user"}, {"content", user_needle}}; const json dummy_typed_user_msg = {{"role", "user"}, {"content", json::array({{{"type", "text"}, {"text", user_needle}}})}}; - caps_.requires_typed_content = + caps_.requires_typed_content_blocks = !contains(try_raw_render(json::array({dummy_str_user_msg}), {}, false), user_needle) && contains(try_raw_render(json::array({dummy_typed_user_msg}), {}, false), user_needle); - const auto dummy_user_msg = caps_.requires_typed_content + const auto uses_blocks = caps_.requires_typed_content_blocks; + const auto dummy_user_msg = uses_blocks ? dummy_typed_user_msg : dummy_str_user_msg; const json needle_system_msg = { {"role", "system"}, - {"content", caps_.requires_typed_content ? json::array({{{"type", "text"}, {"text", sys_needle}}}) : json(sys_needle)}, + {"content", uses_blocks ? 
json::array({{{"type", "text"}, {"text", sys_needle}}}) : json(sys_needle)}, }; caps_.supports_system_role = contains(try_raw_render({needle_system_msg, dummy_user_msg,}, {}, false), sys_needle); @@ -238,6 +267,238 @@ class chat_template { caps_.supports_tool_call_id = contains(out, "call_911_"); } + // Detect thinking / reasoning capabilities + const std::string reasoning_needle = ""; + auto make_assistant_msg = [&](const json & extra_fields, const json & content = json()) { + json msg = {{"role", "assistant"}}; + for (auto & [key, val] : extra_fields.items()) { + msg[key] = val; + } + if (!content.is_null()) { + msg["content"] = content; + } else if (caps_.requires_non_null_content) { + msg["content"] = ""; + } + return msg; + }; + + // Pattern A: reasoning_content field (Qwen3, GLM-4.6/4.7) + // Test both with and without tool_calls to catch position-based templates like Kimi K2 + // that only show reasoning for certain message positions + out = try_raw_render(json::array({ + dummy_user_msg, + make_assistant_msg({{"reasoning_content", reasoning_needle}}), + }), {}, false); + bool supports_reasoning_content = contains(out, reasoning_needle); + bool reasoning_content_requires_tools = false; + // Also test with tool_calls for position-based templates (e.g., Kimi K2) + // that only show reasoning for messages with tool_calls + if (!supports_reasoning_content && caps_.supports_tool_calls) { + auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump()); + json reasoning_with_tools_msg = { + {"role", "assistant"}, + {"content", caps_.requires_non_null_content ? 
"" : json()}, + {"reasoning_content", reasoning_needle}, + {"tool_calls", json::array({make_tool_call("test_tool", dummy_args)})}, + }; + out = try_raw_render(json::array({ + dummy_user_msg, + reasoning_with_tools_msg, + }), {}, false); + supports_reasoning_content = contains(out, reasoning_needle); + if (supports_reasoning_content) { + // Reasoning only works with tool_calls for this template (position-based visibility) + reasoning_content_requires_tools = true; + } + } + + // Pattern D: thought field (MiniCPM3) + out = try_raw_render(json::array({ + dummy_user_msg, + make_assistant_msg({{"thought", reasoning_needle}}, "response"), + }), {}, false); + bool supports_thought_field = contains(out, reasoning_needle); + + // Pattern F: thinking field (GPT-OSS-120B style) + out = try_raw_render(json::array({ + dummy_user_msg, + make_assistant_msg({{"thinking", reasoning_needle}}, "response"), + }), {}, false); + bool supports_thinking_field = contains(out, reasoning_needle); + + // Pattern B: content blocks with type="thinking" (Ministral) + // To detect stringification, we check if the output contains structural markers + // like '"type"' or "'type'" which would appear in serialized JSON/Python + json THINKING_CONTENT_BLOCK_msg = { + {"role", "assistant"}, + {"content", json::array({ + {{"type", "thinking"}, {"thinking", reasoning_needle}}, + {{"type", "text"}, {"text", "response"}} + })} + }; + out = try_raw_render(json::array({dummy_user_msg, THINKING_CONTENT_BLOCK_msg}), {}, false); + // Real support: needle appears but structural markers don't (template extracts content) + // Stringified: needle appears with structural markers (template just serializes the object) + bool supports_THINKING_CONTENT_BLOCK = contains(out, reasoning_needle) + && !contains(out, "\"type\"") && !contains(out, "'type'"); + + // Pattern C: content blocks with type="thoughts" (Apertus) + json THOUGHTS_CONTENT_BLOCK_msg = { + {"role", "assistant"}, + {"content", json::array({ + {{"type", 
"thoughts"}, {"text", reasoning_needle}}, + {{"type", "text"}, {"text", "response"}} + })} + }; + out = try_raw_render(json::array({dummy_user_msg, THOUGHTS_CONTENT_BLOCK_msg}), {}, false); + bool supports_THOUGHTS_CONTENT_BLOCK = contains(out, reasoning_needle) + && !contains(out, "\"type\"") && !contains(out, "'type'"); + + // Pattern E: tool_plan field (Command-R7B) - requires tool_calls + bool supports_tool_plan_field = false; + if (caps_.supports_tool_calls) { + auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump()); + json tool_plan_msg = { + {"role", "assistant"}, + {"content", caps_.requires_non_null_content ? "" : json()}, + {"tool_plan", reasoning_needle}, + {"tool_calls", json::array({make_tool_call("test_tool", dummy_args)})}, + }; + out = try_raw_render(json::array({ + dummy_user_msg, + tool_plan_msg, + }), {}, false); + supports_tool_plan_field = contains(out, reasoning_needle); + } + + // Determine the primary reasoning format (in priority order) + // Field-based patterns are checked first as they are more specific + // Content block patterns are checked last as many templates just stringify unknown content + if (supports_reasoning_content) { + caps_.supports_reasoning = true; + caps_.reasoning_format = ReasoningFormat::REASONING_CONTENT_FIELD; + if (reasoning_content_requires_tools) { + // Position-based templates like Kimi K2 only show reasoning for messages with tool_calls + caps_.reasoning_requires_tools = true; + } + } else if (supports_thought_field) { + caps_.supports_reasoning = true; + caps_.reasoning_format = ReasoningFormat::THOUGHT_FIELD; + } else if (supports_thinking_field) { + caps_.supports_reasoning = true; + caps_.reasoning_format = ReasoningFormat::THINKING_FIELD; + } else if (supports_tool_plan_field) { + caps_.supports_reasoning = true; + caps_.reasoning_format = ReasoningFormat::TOOL_PLAN_FIELD; + caps_.reasoning_requires_tools = true; + } else if (supports_THINKING_CONTENT_BLOCK) { + 
caps_.supports_reasoning = true; + caps_.reasoning_format = ReasoningFormat::THINKING_CONTENT_BLOCK; + // Note: Don't override requires_typed_content_blocks - it's detected separately. + // Templates using content block reasoning may or may not require typed content for all messages. + } else if (supports_THOUGHTS_CONTENT_BLOCK) { + caps_.supports_reasoning = true; + caps_.reasoning_format = ReasoningFormat::THOUGHTS_CONTENT_BLOCK; + // Note: Don't override requires_typed_content_blocks - it's detected separately. + } + + // Test reasoning visibility control (GLM-4.7's clear_thinking pattern) + // When clear_thinking=false is passed, template should show all reasoning + if (caps_.reasoning_format == ReasoningFormat::REASONING_CONTENT_FIELD) { + // Test with multiple assistant messages and clear_thinking=false + const std::string first_reasoning = "<First Reasoning Needle>"; + const std::string second_reasoning = "<Second Reasoning Needle>"; + json extra_ctx = {{"clear_thinking", false}}; + out = try_raw_render(json::array({ + dummy_user_msg, + make_assistant_msg({{"reasoning_content", first_reasoning}}, "first"), + dummy_user_msg, + make_assistant_msg({{"reasoning_content", second_reasoning}}, "second"), + }), {}, false, extra_ctx); + // If both reasonings are visible with clear_thinking=false, template supports it + caps_.supports_clear_thinking = contains(out, first_reasoning) && contains(out, second_reasoning); + } + + // Test reasoning behavior flags for templates that support reasoning + if (caps_.supports_reasoning) { + const std::string reasoning_test = "<Reasoning Test Needle>"; + const std::string content_test = "<Content Test Needle>"; + + // Helper to create assistant message with reasoning in the template's native format + auto make_reasoning_msg = [&](const std::string& reasoning, const std::string& content) -> json { + json msg = {{"role", "assistant"}}; + switch (caps_.reasoning_format) { + case ReasoningFormat::REASONING_CONTENT_FIELD: + msg["reasoning_content"] = reasoning; + msg["content"] = content; + break; + case 
ReasoningFormat::THOUGHT_FIELD: + msg["thought"] = reasoning; + msg["content"] = content; + break; + case ReasoningFormat::THINKING_FIELD: + msg["thinking"] = reasoning; + msg["content"] = content; + break; + case ReasoningFormat::TOOL_PLAN_FIELD: { + // tool_plan requires tool_calls to be present + auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump()); + msg["content"] = caps_.requires_non_null_content ? "" : json(); + msg["tool_plan"] = reasoning; + msg["tool_calls"] = json::array({make_tool_call("test_tool", dummy_args)}); + break; + } + case ReasoningFormat::THINKING_CONTENT_BLOCK: + msg["content"] = json::array({ + {{"type", "thinking"}, {"thinking", reasoning}}, + {{"type", "text"}, {"text", content}} + }); + break; + case ReasoningFormat::THOUGHTS_CONTENT_BLOCK: + msg["content"] = json::array({ + {{"type", "thoughts"}, {"text", reasoning}}, + {{"type", "text"}, {"text", content}} + }); + break; + default: + break; + } + return msg; + }; + + // Test supports_reasoning_without_content: can template emit reasoning with empty content? + // Skip for TOOL_PLAN_FIELD since it requires tool_calls which have different semantics + if (caps_.reasoning_format != ReasoningFormat::TOOL_PLAN_FIELD) { + out = try_raw_render(json::array({ + dummy_user_msg, + make_reasoning_msg(reasoning_test, ""), + }), {}, false); + caps_.supports_reasoning_without_content = contains(out, reasoning_test); + } + + // Test supports_reasoning_with_content: can template emit both reasoning and content together? 
+ // Skip for TOOL_PLAN_FIELD since tool calls don't have regular content + if (caps_.reasoning_format != ReasoningFormat::TOOL_PLAN_FIELD) { + out = try_raw_render(json::array({ + dummy_user_msg, + make_reasoning_msg(reasoning_test, content_test), + }), {}, false); + caps_.supports_reasoning_with_content = contains(out, reasoning_test) && contains(out, content_test); + } + + // Test supports_enable_thinking: does template honor enable_thinking=false? + // Only test for REASONING_CONTENT_FIELD format where this flag is commonly used (Qwen3) + if (caps_.reasoning_format == ReasoningFormat::REASONING_CONTENT_FIELD) { + json disable_ctx = {{"enable_thinking", false}}; + out = try_raw_render(json::array({ + dummy_user_msg, + make_reasoning_msg(reasoning_test, content_test), + }), {}, false, disable_ctx); + // If reasoning disappears but content remains when enable_thinking=false, template respects it + caps_.supports_enable_thinking = !contains(out, reasoning_test) && contains(out, content_test); + } + } + try { if (!caps_.supports_tools) { const json user_msg { @@ -342,6 +603,7 @@ class chat_template { auto has_tool_calls = false; auto has_tool_responses = false; auto has_string_content = false; + auto has_reasoning_content = false; for (const auto & message : inputs.messages) { if (message.contains("tool_calls") && !message["tool_calls"].is_null()) { has_tool_calls = true; @@ -352,6 +614,9 @@ class chat_template { if (message.contains("content") && message["content"].is_string()) { has_string_content = true; } + if (message.contains("reasoning_content") && !message["reasoning_content"].is_null()) { + has_reasoning_content = true; + } } auto polyfill_system_role = opts.polyfill_system_role && !caps_.supports_system_role; @@ -360,7 +625,12 @@ class chat_template { auto polyfill_tool_calls = opts.polyfill_tool_calls && has_tool_calls && !caps_.supports_tool_calls; auto polyfill_tool_responses = opts.polyfill_tool_responses && has_tool_responses && 
!caps_.supports_tool_responses; auto polyfill_object_arguments = opts.polyfill_object_arguments && has_tool_calls && caps_.requires_object_arguments; - auto polyfill_typed_content = opts.polyfill_typed_content && has_string_content && caps_.requires_typed_content; + auto polyfill_typed_content = opts.polyfill_typed_content && has_string_content && caps_.requires_typed_content_blocks; + // Polyfill reasoning_content to template's native format when template supports + // a different reasoning format than REASONING_CONTENT_FIELD (the canonical format) + auto polyfill_reasoning = opts.polyfill_reasoning && has_reasoning_content + && caps_.reasoning_format != ReasoningFormat::NONE + && caps_.reasoning_format != ReasoningFormat::REASONING_CONTENT_FIELD; auto needs_polyfills = opts.apply_polyfills && (false || polyfill_system_role @@ -369,20 +639,30 @@ class chat_template { || polyfill_tool_responses || polyfill_object_arguments || polyfill_typed_content + || polyfill_reasoning ); if (needs_polyfills) { actual_messages = json::array(); + // Helper to build typed content array from string or existing array + auto build_content_array = [](const json & content) -> json { + json content_blocks = json::array(); + if (content.is_string()) { + content_blocks.push_back({{"type", "text"}, {"text", content}}); + } else if (content.is_array()) { + for (const auto & block : content) { + content_blocks.push_back(block); + } + } + return content_blocks; + }; + auto add_message = [&](const json & msg) { if (polyfill_typed_content && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) { - actual_messages.push_back({ - {"role", msg.at("role")}, - {"content", {{ - {"type", "text"}, - {"text", msg.at("content")}, - }}}, - }); + auto adjusted = msg; + adjusted["content"] = build_content_array(msg.at("content")); + actual_messages.push_back(adjusted); } else { actual_messages.push_back(msg); } @@ -476,6 +756,56 @@ class chat_template { 
message.erase("name"); } + // Polyfill reasoning_content to template's native format + if (polyfill_reasoning && message.contains("reasoning_content") && !message["reasoning_content"].is_null()) { + auto reasoning = message["reasoning_content"]; + switch (caps_.reasoning_format) { + case ReasoningFormat::THOUGHT_FIELD: + // MiniCPM3 style: message.thought + message["thought"] = reasoning; + break; + case ReasoningFormat::THINKING_FIELD: + // GPT-OSS-120B style: message.thinking + message["thinking"] = reasoning; + break; + case ReasoningFormat::TOOL_PLAN_FIELD: + // Command-R7B style: message.tool_plan (only with tool_calls) + if (message.contains("tool_calls")) { + message["tool_plan"] = reasoning; + } + break; + case ReasoningFormat::THINKING_CONTENT_BLOCK: + // Ministral style: content blocks with type="thinking" + { + json content_blocks = json::array(); + content_blocks.push_back({{"type", "thinking"}, {"thinking", reasoning}}); + if (message.contains("content") && !message["content"].is_null()) { + for (const auto & block : build_content_array(message["content"])) { + content_blocks.push_back(block); + } + } + message["content"] = content_blocks; + } + break; + case ReasoningFormat::THOUGHTS_CONTENT_BLOCK: + // Apertus style: content blocks with type="thoughts" + { + json content_blocks = json::array(); + content_blocks.push_back({{"type", "thoughts"}, {"text", reasoning}}); + if (message.contains("content") && !message["content"].is_null()) { + for (const auto & block : build_content_array(message["content"])) { + content_blocks.push_back(block); + } + } + message["content"] = content_blocks; + } + break; + default: + break; + } + message.erase("reasoning_content"); + } + if (!message["content"].is_null() && polyfill_system_role) { std::string content = message.at("content"); if (role == "system") { diff --git a/include/minja/minja.hpp b/include/minja/minja.hpp index 943e290..f8cd8f7 100644 --- a/include/minja/minja.hpp +++ b/include/minja/minja.hpp @@ -95,7 
+95,7 @@ class Value { } out << string_quote; } - void dump(std::ostringstream & out, int indent = -1, int level = 0, bool to_json = false) const { + void dump(std::ostringstream & out, int indent, int level, bool to_json, const std::string & item_sep, const std::string & key_sep) const { auto print_indent = [&](int level) { if (indent > 0) { out << "\n"; @@ -103,9 +103,11 @@ class Value { } }; auto print_sub_sep = [&]() { - out << ','; - if (indent < 0) out << ' '; - else print_indent(level + 1); + if (indent < 0) out << item_sep; + else { + out << ','; + print_indent(level + 1); + } }; auto string_quote = to_json ? '"' : '\''; @@ -116,7 +118,7 @@ class Value { print_indent(level + 1); for (size_t i = 0; i < array_->size(); ++i) { if (i) print_sub_sep(); - (*array_)[i].dump(out, indent, level + 1, to_json); + (*array_)[i].dump(out, indent, level + 1, to_json, item_sep, key_sep); } print_indent(level); out << "]"; @@ -130,8 +132,8 @@ class Value { } else { out << string_quote << it->first.dump() << string_quote; } - out << ": "; - it->second.dump(out, indent, level + 1, to_json); + out << key_sep; + it->second.dump(out, indent, level + 1, to_json, item_sep, key_sep); } print_indent(level); out << "}"; @@ -447,9 +449,9 @@ class Value { throw std::runtime_error("get not defined for this value type: " + dump()); } - std::string dump(int indent=-1, bool to_json=false) const { + std::string dump(int indent=-1, bool to_json=false, const std::string & item_sep = ", ", const std::string & key_sep = ": ") const { std::ostringstream out; - dump(out, indent, 0, to_json); + dump(out, indent, 0, to_json, item_sep, key_sep); return out.str(); } @@ -2736,8 +2738,17 @@ inline std::shared_ptr Context::builtins() { globals.set("raise_exception", simple_function("raise_exception", { "message" }, [](const std::shared_ptr &, Value & args) -> Value { throw std::runtime_error(args.at("message").get()); })); - globals.set("tojson", simple_function("tojson", { "value", "indent", 
"ensure_ascii" }, [](const std::shared_ptr &, Value & args) { - return Value(args.at("value").dump(args.get("indent", -1), /* to_json= */ true)); + globals.set("tojson", simple_function("tojson", { "value", "indent", "ensure_ascii", "separators" }, [](const std::shared_ptr &, Value & args) { + std::string item_sep = ", "; + std::string key_sep = ": "; + if (args.contains("separators")) { + const auto & sep = args.at("separators"); + if (sep.is_array() && sep.size() == 2) { + item_sep = sep.at(0).get(); + key_sep = sep.at(1).get(); + } + } + return Value(args.at("value").dump(args.get("indent", -1), /* to_json= */ true, item_sep, key_sep)); })); globals.set("items", simple_function("items", { "object" }, [](const std::shared_ptr &, Value & args) { auto items = Value::array(); diff --git a/scripts/fetch_templates_and_goldens.py b/scripts/fetch_templates_and_goldens.py index 9501cf5..77e370b 100644 --- a/scripts/fetch_templates_and_goldens.py +++ b/scripts/fetch_templates_and_goldens.py @@ -50,8 +50,8 @@ def strftime_now(format): now = datetime.datetime.strptime(TEST_DATE, "%Y-%m-%d") return now.strftime(format) -def tojson(value, indent=None, ensure_ascii=False, sort_keys=False): - return json.dumps(value, indent=indent, ensure_ascii=ensure_ascii, sort_keys=sort_keys) +def tojson(value, indent=None, ensure_ascii=False, sort_keys=False, separators=None): + return json.dumps(value, indent=indent, ensure_ascii=ensure_ascii, sort_keys=sort_keys, separators=separators) def join_cmake_path(parent, child): ''' @@ -73,6 +73,17 @@ def add_system(messages, system_prompt): "content": system_prompt, }) +from enum import Enum + +class ReasoningFormat(Enum): + NONE = "NONE" + REASONING_CONTENT = "REASONING_CONTENT" # message.reasoning_content (Qwen3, GLM-4.6/4.7) - canonical format + CONTENT_BLOCK_THINKING = "CONTENT_BLOCK_THINKING" # content[].type == "thinking" (Ministral) + CONTENT_BLOCK_THOUGHTS = "CONTENT_BLOCK_THOUGHTS" # content[].type == "thoughts" (Apertus) + 
THOUGHT_FIELD = "THOUGHT_FIELD" # message.thought (MiniCPM3) + TOOL_PLAN_FIELD = "TOOL_PLAN_FIELD" # message.tool_plan (Command-R7B) + THINKING_FIELD = "THINKING_FIELD" # message.thinking (GPT-OSS-120B) + # data class @dataclass class TemplateCaps: @@ -84,7 +95,16 @@ class TemplateCaps: supports_tool_call_id: bool = False requires_object_arguments: bool = False requires_non_null_content: bool = False - requires_typed_content: bool = False + requires_typed_content_blocks: bool = False + # Reasoning capabilities (extended thinking / chain-of-thought) + supports_reasoning: bool = False + reasoning_format: ReasoningFormat = ReasoningFormat.NONE + reasoning_requires_tools: bool = False + # Reasoning behavior flags + supports_reasoning_without_content: bool = False + supports_reasoning_with_content: bool = False + supports_enable_thinking: bool = False + supports_clear_thinking: bool = False def to_json(self): return json.dumps({ @@ -96,7 +116,7 @@ def to_json(self): "supports_tool_call_id": self.supports_tool_call_id, "requires_object_arguments": self.requires_object_arguments, # "requires_non_null_content": self.requires_non_null_content, - "requires_typed_content": self.requires_typed_content, + "requires_typed_content_blocks": self.requires_typed_content_blocks, }, indent=2) @@ -142,12 +162,12 @@ def __init__(self, template, env=None, filters=None, global_functions=None): dummy_str_user_msg = {"role": "user", "content": user_needle } dummy_typed_user_msg = {"role": "user", "content": [{"type": "text", "text": user_needle}]} - caps.requires_typed_content = \ + caps.requires_typed_content_blocks = \ (user_needle not in self.try_raw_render([dummy_str_user_msg])) \ and (user_needle in self.try_raw_render([dummy_typed_user_msg])) - dummy_user_msg = dummy_typed_user_msg if caps.requires_typed_content else dummy_str_user_msg + dummy_user_msg = dummy_typed_user_msg if caps.requires_typed_content_blocks else dummy_str_user_msg - needle_system_msg = {"role": "system", 
"content": [{"type": "text", "text": sys_needle}] if caps.requires_typed_content else sys_needle} + needle_system_msg = {"role": "system", "content": [{"type": "text", "text": sys_needle}] if caps.requires_typed_content_blocks else sys_needle} caps.supports_system_role = sys_needle in self.try_raw_render([needle_system_msg, dummy_user_msg]) @@ -278,11 +298,197 @@ def make_tool_call(tool_name, arguments): except Exception as e: print(f"Failed to generate tool call example: {e}", file=sys.stderr) + # Detect thinking / reasoning capabilities + reasoning_needle = "" + + def make_assistant_msg(extra_fields, content=None): + msg = {"role": "assistant"} + msg.update(extra_fields) + if content is not None: + msg["content"] = content + elif caps.requires_non_null_content: + msg["content"] = "" + return msg + + # Pattern A: reasoning_content field (Qwen3, GLM-4.6/4.7) + out = self.try_raw_render([ + dummy_user_msg, + make_assistant_msg({"reasoning_content": reasoning_needle}), + ]) + supports_reasoning_content = reasoning_needle in out + + # Pattern D: thought field (MiniCPM3) + out = self.try_raw_render([ + dummy_user_msg, + make_assistant_msg({"thought": reasoning_needle}, "response"), + ]) + supports_thought_field = reasoning_needle in out + + # Pattern F: thinking field (GPT-OSS-120B style) + out = self.try_raw_render([ + dummy_user_msg, + make_assistant_msg({"thinking": reasoning_needle}, "response"), + ]) + supports_reasoning_field = reasoning_needle in out + + # Pattern B: content blocks with type="thinking" (Ministral) + # To detect stringification, we check if the output contains structural markers + # like '"type"' or "'type'" which would appear in serialized JSON/Python + content_block_thinking_msg = { + "role": "assistant", + "content": [ + {"type": "thinking", "thinking": reasoning_needle}, + {"type": "text", "text": "response"} + ] + } + out = self.try_raw_render([dummy_user_msg, content_block_thinking_msg]) + # Real support: needle appears but structural 
markers don't (template extracts content) + # Stringified: needle appears with structural markers (template just serializes the object) + supports_content_block_thinking = reasoning_needle in out \ + and '"type"' not in out and "'type'" not in out + + # Pattern C: content blocks with type="thoughts" (Apertus) + content_block_thoughts_msg = { + "role": "assistant", + "content": [ + {"type": "thoughts", "text": reasoning_needle}, + {"type": "text", "text": "response"} + ] + } + out = self.try_raw_render([dummy_user_msg, content_block_thoughts_msg]) + supports_content_block_thoughts = reasoning_needle in out \ + and '"type"' not in out and "'type'" not in out + + # Pattern E: tool_plan field (Command-R7B) - requires tool_calls + supports_tool_plan_field = False + if caps.supports_tool_calls: + dummy_args = dummy_args_obj if caps.requires_object_arguments else json.dumps(dummy_args_obj) + tool_plan_msg = { + "role": "assistant", + "content": "" if caps.requires_non_null_content else None, + "tool_plan": reasoning_needle, + "tool_calls": [make_tool_call("test_tool", dummy_args)], + } + out = self.try_raw_render([ + dummy_user_msg, + tool_plan_msg, + ]) + supports_tool_plan_field = reasoning_needle in out + + # Determine the primary reasoning format (in priority order) + # Field-based patterns are checked first as they are more specific + # Content block patterns are checked last as many templates just stringify unknown content + if supports_reasoning_content: + caps.supports_reasoning = True + caps.reasoning_format = ReasoningFormat.REASONING_CONTENT + elif supports_thought_field: + caps.supports_reasoning = True + caps.reasoning_format = ReasoningFormat.THOUGHT_FIELD + elif supports_reasoning_field: + caps.supports_reasoning = True + caps.reasoning_format = ReasoningFormat.THINKING_FIELD + elif supports_tool_plan_field: + caps.supports_reasoning = True + caps.reasoning_format = ReasoningFormat.TOOL_PLAN_FIELD + caps.reasoning_requires_tools = True + elif 
supports_content_block_thinking: + caps.supports_reasoning = True + caps.reasoning_format = ReasoningFormat.CONTENT_BLOCK_THINKING + elif supports_content_block_thoughts: + caps.supports_reasoning = True + caps.reasoning_format = ReasoningFormat.CONTENT_BLOCK_THOUGHTS + + # Test clear_thinking support (GLM-4.7 pattern) + if caps.reasoning_format == ReasoningFormat.REASONING_CONTENT: + first_reasoning = "" + second_reasoning = "" + out = self.try_raw_render([ + dummy_user_msg, + make_assistant_msg({"reasoning_content": first_reasoning}, "first"), + dummy_user_msg, + make_assistant_msg({"reasoning_content": second_reasoning}, "second"), + ], extra_context={"clear_thinking": False}) + caps.supports_clear_thinking = first_reasoning in out and second_reasoning in out + + # Test reasoning behavior flags for templates that support reasoning + if caps.supports_reasoning: + reasoning_test = "" + content_test = "" + + # Helper to create assistant message with reasoning in the template's native format + def make_reasoning_msg(reasoning: str, content: str) -> dict: + fmt = caps.reasoning_format + if fmt == ReasoningFormat.REASONING_CONTENT: + return {"role": "assistant", "reasoning_content": reasoning, "content": content} + elif fmt == ReasoningFormat.THOUGHT_FIELD: + return {"role": "assistant", "thought": reasoning, "content": content} + elif fmt == ReasoningFormat.THINKING_FIELD: + return {"role": "assistant", "thinking": reasoning, "content": content} + elif fmt == ReasoningFormat.TOOL_PLAN_FIELD: + dummy_args = dummy_args_obj if caps.requires_object_arguments else json.dumps(dummy_args_obj) + return { + "role": "assistant", + "content": "" if caps.requires_non_null_content else None, + "tool_plan": reasoning, + "tool_calls": [make_tool_call("test_tool", dummy_args)] + } + elif fmt == ReasoningFormat.CONTENT_BLOCK_THINKING: + return { + "role": "assistant", + "content": [ + {"type": "thinking", "thinking": reasoning}, + {"type": "text", "text": content} + ] + } + elif fmt 
== ReasoningFormat.CONTENT_BLOCK_THOUGHTS: + return { + "role": "assistant", + "content": [ + {"type": "thoughts", "text": reasoning}, + {"type": "text", "text": content} + ] + } + return {"role": "assistant", "content": content} + + # Test supports_reasoning_without_content: can template emit reasoning with empty content? + # Skip for TOOL_PLAN_FIELD since it requires tool_calls which have different semantics + if caps.reasoning_format != ReasoningFormat.TOOL_PLAN_FIELD: + out = self.try_raw_render([dummy_user_msg, make_reasoning_msg(reasoning_test, "")]) + caps.supports_reasoning_without_content = reasoning_test in out + + # Test supports_reasoning_with_content: can template emit both reasoning and content together? + # Skip for TOOL_PLAN_FIELD since tool calls don't have regular content + if caps.reasoning_format != ReasoningFormat.TOOL_PLAN_FIELD: + out = self.try_raw_render([dummy_user_msg, make_reasoning_msg(reasoning_test, content_test)]) + caps.supports_reasoning_with_content = reasoning_test in out and content_test in out + + # Test supports_enable_thinking: does template honor enable_thinking=false? 
+ # Only test for REASONING_CONTENT format where this flag is commonly used (Qwen3) + if caps.reasoning_format == ReasoningFormat.REASONING_CONTENT: + out = self.try_raw_render( + [dummy_user_msg, make_reasoning_msg(reasoning_test, content_test)], + extra_context={"enable_thinking": False} + ) + # If reasoning disappears but content remains when enable_thinking=false, template respects it + caps.supports_enable_thinking = reasoning_test not in out and content_test in out + self.original_caps = caps def needs_polyfills(self, context): has_tools = context.get('tools') is not None caps = self.original_caps + + # Check if any message has reasoning_content that needs polyfilling + has_reasoning_content = any( + msg.get('reasoning_content') is not None + for msg in context.get('messages', []) + ) + # Polyfill reasoning_content to template's native format when template supports + # a different reasoning format than REASONING_CONTENT (the canonical format) + needs_reasoning_polyfill = has_reasoning_content \ + and caps.reasoning_format != ReasoningFormat.NONE \ + and caps.reasoning_format != ReasoningFormat.REASONING_CONTENT + return not caps.supports_system_role \ or (has_tools is not None and (False \ or not caps.supports_tools \ @@ -290,7 +496,8 @@ def needs_polyfills(self, context): or not caps.supports_tool_calls \ or caps.requires_object_arguments \ )) \ - or caps.requires_typed_content + or caps.requires_typed_content_blocks \ + or needs_reasoning_polyfill def apply(self, context: dict): assert isinstance(context, dict) @@ -340,7 +547,50 @@ def apply(self, context: dict): }, indent=2) del message['name'] - if caps.requires_typed_content: + # Polyfill reasoning_content to template's native format + should_polyfill_reasoning = caps.reasoning_format not in ( + ReasoningFormat.NONE, + ReasoningFormat.REASONING_CONTENT, + ) + if should_polyfill_reasoning and 'reasoning_content' in message and message['reasoning_content'] is not None: + reasoning = 
message['reasoning_content'] + if caps.reasoning_format == ReasoningFormat.THOUGHT_FIELD: + # MiniCPM3 style: message.thought + message['thought'] = reasoning + del message['reasoning_content'] + elif caps.reasoning_format == ReasoningFormat.THINKING_FIELD: + # GPT-OSS-120B style: message.thinking + message['thinking'] = reasoning + del message['reasoning_content'] + elif caps.reasoning_format == ReasoningFormat.TOOL_PLAN_FIELD: + # Command-R7B style: message.tool_plan (only with tool_calls) + if 'tool_calls' in message: + message['tool_plan'] = reasoning + del message['reasoning_content'] + elif caps.reasoning_format == ReasoningFormat.CONTENT_BLOCK_THINKING: + # Ministral style: content blocks with type="thinking" + content_blocks = [{"type": "thinking", "thinking": reasoning}] + original_content = message.get('content') + if original_content is not None: + if isinstance(original_content, str): + content_blocks.append({"type": "text", "text": original_content}) + elif isinstance(original_content, list): + content_blocks.extend(original_content) + message['content'] = content_blocks + del message['reasoning_content'] + elif caps.reasoning_format == ReasoningFormat.CONTENT_BLOCK_THOUGHTS: + # Apertus style: content blocks with type="thoughts" + content_blocks = [{"type": "thoughts", "text": reasoning}] + original_content = message.get('content') + if original_content is not None: + if isinstance(original_content, str): + content_blocks.append({"type": "text", "text": original_content}) + elif isinstance(original_content, list): + content_blocks.extend(original_content) + message['content'] = content_blocks + del message['reasoning_content'] + + if caps.requires_typed_content_blocks: for message in context['messages']: if 'content' in message and isinstance(message['content'], str): message['content'] = [{"type": "text", "text": message['content']}] diff --git a/scripts/render.py b/scripts/render.py index 0de5d45..68acba4 100644 --- a/scripts/render.py +++ 
b/scripts/render.py @@ -11,11 +11,15 @@ import jinja2.ext from pathlib import Path +def tojson(value, indent=None, ensure_ascii=False, sort_keys=False, separators=None): + return json.dumps(value, indent=indent, ensure_ascii=ensure_ascii, sort_keys=sort_keys, separators=separators) + input_file, output_file = sys.argv[1:3] data = json.loads(Path(input_file).read_text()) # print(json.dumps(data, indent=2), file=sys.stderr) env = Environment(**data['options'], extensions=[jinja2.ext.loopcontrols]) +env.filters['tojson'] = tojson tmpl = env.from_string(data['template']) output = tmpl.render(data['bindings']) Path(output_file).write_text(output) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c2e9ed2..27eba25 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -143,12 +143,14 @@ set(MODEL_IDS deepseek-ai/DeepSeek-V2-Lite deepseek-ai/DeepSeek-V2.5 deepseek-ai/DeepSeek-V3 + deepseek-ai/DeepSeek-V3.1 # deepseek-ai/DeepSeek-V3.2 # No Jinja template; see synthetic below deepseek-ai/deepseek-coder-7b-instruct-v1.5 dicta-il/dictalm2.0-instruct ehristoforu/Falcon3-8B-Franken-Basestruct google/gemma-7b-it ibm-granite/granite-3.1-8b-instruct + ibm-granite/granite-3.3-2b-instruct inclusionAI/Ling-Coder-lite indischepartij/MiniCPM-3B-OpenHermes-2.5-v2 jinaai/ReaderLM-v2 @@ -156,6 +158,7 @@ set(MODEL_IDS llava-hf/llava-1.5-7b-hf meetkai/functionary-medium-v3.1 meetkai/functionary-medium-v3.2 + moonshotai/Kimi-K2-Instruct meta-llama/Llama-2-7b-chat-hf meta-llama/Llama-3.1-8B-Instruct meta-llama/Llama-3.2-3B-Instruct @@ -195,6 +198,7 @@ set(MODEL_IDS upstage/solar-pro-preview-instruct xwen-team/Xwen-7B-Chat zai-org/GLM-4.6 + zai-org/GLM-4.7 # Synthetic templates for models without Jinja templates ${CMAKE_CURRENT_SOURCE_DIR}/synthetic-deepseek-v3.2-dsml.jinja diff --git a/tests/contexts/reasoning_clear_thinking.json b/tests/contexts/reasoning_clear_thinking.json new file mode 100644 index 0000000..00b9c2b --- /dev/null +++ 
b/tests/contexts/reasoning_clear_thinking.json @@ -0,0 +1,31 @@ +{ + "messages": [ + { + "role": "user", + "content": "What is 2+2?" + }, + { + "role": "assistant", + "reasoning_content": "Simple \"arithmetic\": 2+2=4", + "content": "It's \"4\"." + }, + { + "role": "user", + "content": "And 3+3?" + }, + { + "role": "assistant", + "reasoning_content": "Similarly: 3+3=6", + "content": "6" + } + ], + "add_generation_prompt": true, + "clear_thinking": false, + "bos_token": "<|startoftext|>", + "eos_token": "<|endoftext|>", + "_test_metadata": { + "_comment": "clear_thinking=false should show ALL reasoning. Quote in reasoning_content and content tests non-stringification.", + "expected_strings_if_supports_reasoning": ["Simple \"arithmetic\": 2+2=4", "Similarly: 3+3=6"], + "forbidden_strings": ["\"reasoning_content\"", "\\\"arithmetic\\\"", "\\\"4\\\""] + } +} diff --git a/tests/contexts/reasoning_disabled.json b/tests/contexts/reasoning_disabled.json new file mode 100644 index 0000000..05e89fd --- /dev/null +++ b/tests/contexts/reasoning_disabled.json @@ -0,0 +1,21 @@ +{ + "messages": [ + { + "role": "user", + "content": "Quick answer: what is 2+2?" + }, + { + "role": "assistant", + "content": "It's \"4\"." + } + ], + "add_generation_prompt": true, + "enable_thinking": false, + "bos_token": "<|startoftext|>", + "eos_token": "<|endoftext|>", + "_test_metadata": { + "_comment": "enable_thinking=false disables thinking mode. 
Quote in content tests non-stringification.", + "expected_strings": ["Quick answer: what is 2+2?", "It's \"4\"."], + "forbidden_strings": ["\"reasoning_content\"", "\\\"4\\\""] + } +} diff --git a/tests/contexts/reasoning_multi_turn.json b/tests/contexts/reasoning_multi_turn.json new file mode 100644 index 0000000..6c98ae5 --- /dev/null +++ b/tests/contexts/reasoning_multi_turn.json @@ -0,0 +1,39 @@ +{ + "messages": [ + { + "role": "user", + "content": "Let's solve a puzzle step by step" + }, + { + "role": "assistant", + "reasoning_content": "This is a multi-step problem. Let me break it down.", + "content": "Sure, let's work through it together." + }, + { + "role": "user", + "content": "First clue: the number is even" + }, + { + "role": "assistant", + "reasoning_content": "An even number... that narrows it to 2, 4, 6, 8...", + "content": "Noted. What's the next clue?" + }, + { + "role": "user", + "content": "It's less than 5" + }, + { + "role": "assistant", + "reasoning_content": "Even and less than 5 means it's \"either\" 2 or 4.", + "content": "The number must be \"2 or 4\"!" + } + ], + "add_generation_prompt": true, + "bos_token": "<|startoftext|>", + "eos_token": "<|endoftext|>", + "_test_metadata": { + "_comment": "Multi-turn reasoning. Quote in reasoning_content and content tests non-stringification.", + "expected_strings_if_supports_reasoning": ["Even and less than 5 means it's \"either\" 2 or 4.", "The number must be \"2 or 4\"!"], + "forbidden_strings": ["\"reasoning_content\"", "\\\"either\\\"", "\\\"2 or 4\\\""] + } +} diff --git a/tests/contexts/reasoning_only.json b/tests/contexts/reasoning_only.json new file mode 100644 index 0000000..60d3fdb --- /dev/null +++ b/tests/contexts/reasoning_only.json @@ -0,0 +1,21 @@ +{ + "messages": [ + { + "role": "user", + "content": "What is 2+2?" + }, + { + "role": "assistant", + "reasoning_content": "Let me calculate: 2+2 equals 4.", + "content": "The answer is \"four\"." 
+ } + ], + "add_generation_prompt": true, + "bos_token": "<|startoftext|>", + "eos_token": "<|endoftext|>", + "_test_metadata": { + "_comment": "For templates with supports_reasoning=true, reasoning should appear in output. Quote in content tests for non-stringification.", + "expected_strings_if_supports_reasoning": ["Let me calculate: 2+2 equals 4.", "The answer is \"four\"."], + "forbidden_strings": ["\"reasoning_content\"", "\\\"four\\\""] + } +} diff --git a/tests/contexts/reasoning_position_based.json b/tests/contexts/reasoning_position_based.json new file mode 100644 index 0000000..e8f89d7 --- /dev/null +++ b/tests/contexts/reasoning_position_based.json @@ -0,0 +1,30 @@ +{ + "messages": [ + { + "role": "user", + "content": "What is 2+2?" + }, + { + "role": "assistant", + "reasoning_content": "Simple arithmetic: 2+2=4", + "content": "4" + }, + { + "role": "user", + "content": "And 3+3?" + }, + { + "role": "assistant", + "reasoning_content": "Similarly: \"3+3\"=6", + "content": "It's \"6\"." + } + ], + "add_generation_prompt": true, + "bos_token": "<|startoftext|>", + "eos_token": "<|endoftext|>", + "_test_metadata": { + "_comment": "Position-based: only last reasoning shown. Quote in reasoning_content and content tests non-stringification.", + "expected_strings_if_supports_reasoning": ["Similarly: \"3+3\"=6", "It's \"6\"."], + "forbidden_strings": ["\"reasoning_content\"", "\\\"3+3\\\"", "\\\"6\\\""] + } +} diff --git a/tests/contexts/reasoning_with_tools.json b/tests/contexts/reasoning_with_tools.json new file mode 100644 index 0000000..4d5b336 --- /dev/null +++ b/tests/contexts/reasoning_with_tools.json @@ -0,0 +1,61 @@ +{ + "messages": [ + { + "role": "user", + "content": "Calculate 15% tip on $50" + }, + { + "role": "assistant", + "reasoning_content": "I need to calculate \"15%\" of $50. 
Let me use the calculator tool.", + "content": "", + "tool_calls": [ + { + "id": "call_1___", + "type": "function", + "function": { + "name": "calculator", + "arguments": "{\"expression\": \"50 * 0.15\"}" + } + } + ] + }, + { + "role": "tool", + "tool_call_id": "call_1___", + "name": "calculator", + "content": "7.5" + }, + { + "role": "assistant", + "reasoning_content": "The calculation returned 7.5, so the tip is $7.50.", + "content": "A 15% tip on $50 is \"$7.50\"." + } + ], + "add_generation_prompt": true, + "bos_token": "<|startoftext|>", + "eos_token": "<|endoftext|>", + "tools": [ + { + "type": "function", + "function": { + "name": "calculator", + "description": "Evaluate a mathematical expression", + "parameters": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "The mathematical expression to evaluate." + } + }, + "required": ["expression"] + } + } + } + ], + "_test_metadata": { + "_comment": "Reasoning with tool calls. Quote in reasoning_content and content tests non-stringification.", + "expected_strings_if_supports_reasoning": ["I need to calculate \"15%\" of $50", "A 15% tip on $50 is \"$7.50\"."], + "forbidden_strings": ["\"reasoning_content\"", "\\\"15%\\\"", "\\\"$7.50\\\""] + } +} diff --git a/tests/contexts/simple.json b/tests/contexts/simple.json index 5e89f22..e158995 100644 --- a/tests/contexts/simple.json +++ b/tests/contexts/simple.json @@ -6,11 +6,16 @@ }, { "role": "assistant", - "content": "llama.cpp!" + "content": "I'd say \"llama.cpp\"!" } ], "add_generation_prompt": true, "bos_token": "<|startoftext|>", "eos_token": "<|endoftext|>", - "tools_in_user_message": false + "tools_in_user_message": false, + "_test_metadata": { + "_comment": "Basic conversation without tools or system message. 
Quote in content tests non-stringification.", + "expected_strings": ["What's your favourite LLM framework?", "I'd say \"llama.cpp\"!"], + "forbidden_strings": ["\\\"llama.cpp\\\""] + } } diff --git a/tests/contexts/system.json b/tests/contexts/system.json index 7cbc5c2..7cef6a6 100644 --- a/tests/contexts/system.json +++ b/tests/contexts/system.json @@ -2,7 +2,7 @@ "messages": [ { "role": "system", - "content": "You only tell the truth." + "content": "You only tell \"the truth\"." }, { "role": "user", @@ -16,5 +16,10 @@ "add_generation_prompt": true, "bos_token": "<|startoftext|>", "eos_token": "<|endoftext|>", - "tools_in_user_message": false + "tools_in_user_message": false, + "_test_metadata": { + "_comment": "Conversation with system message. Quote in system content tests non-stringification.", + "expected_strings": ["What's your favourite LLM framework?", "llama.cpp!"], + "forbidden_strings": ["\\\"the truth\\\""] + } } diff --git a/tests/contexts/tool_plan_reasoning.json b/tests/contexts/tool_plan_reasoning.json new file mode 100644 index 0000000..51c77d2 --- /dev/null +++ b/tests/contexts/tool_plan_reasoning.json @@ -0,0 +1,100 @@ +{ + "messages": [ + { + "role": "user", + "content": "What's the weather in Paris and convert it to Fahrenheit?" + }, + { + "role": "assistant", + "reasoning_content": "I need to first get the weather in \"Paris\", then convert the temperature.", + "content": "", + "tool_calls": [ + { + "id": "call_1___", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"city\": \"Paris\"}" + } + } + ] + }, + { + "role": "tool", + "tool_call_id": "call_1___", + "name": "get_weather", + "content": "{\"temperature\": 20, \"unit\": \"celsius\", \"condition\": \"sunny\"}" + }, + { + "role": "assistant", + "reasoning_content": "Got 20°C. 
Now I need to convert: F = C * 9/5 + 32 = 20 * 1.8 + 32 = 68°F", + "content": "", + "tool_calls": [ + { + "id": "call_2___", + "type": "function", + "function": { + "name": "convert_temperature", + "arguments": "{\"celsius\": 20}" + } + } + ] + }, + { + "role": "tool", + "tool_call_id": "call_2___", + "name": "convert_temperature", + "content": "{\"fahrenheit\": 68}" + }, + { + "role": "assistant", + "content": "The weather in Paris is sunny at \"twenty\" degrees (68°F)." + } + ], + "add_generation_prompt": true, + "bos_token": "<|startoftext|>", + "eos_token": "<|endoftext|>", + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city name" + } + }, + "required": ["city"] + } + } + }, + { + "type": "function", + "function": { + "name": "convert_temperature", + "description": "Convert Celsius to Fahrenheit", + "parameters": { + "type": "object", + "properties": { + "celsius": { + "type": "number", + "description": "Temperature in Celsius" + } + }, + "required": ["celsius"] + } + } + } + ], + "_test_metadata": { + "_comment": "Multi-step tool use with reasoning. Quote in reasoning_content and content tests non-stringification.", + "expected_strings": ["The weather in Paris is sunny at \"twenty\" degrees (68°F)."], + "expected_strings_if_supports_tool_calls": ["get_weather", "convert_temperature"], + "expected_strings_if_supports_reasoning": ["I need to first get the weather in \"Paris\"", "convert: F = C * 9/5 + 32"], + "forbidden_strings": ["\"reasoning_content\"", "\"tool_plan\"", "\\\"Paris\\\"", "\\\"twenty\\\""] + } +} diff --git a/tests/contexts/tool_use.json b/tests/contexts/tool_use.json index cca70cb..5c09881 100644 --- a/tests/contexts/tool_use.json +++ b/tests/contexts/tool_use.json @@ -26,7 +26,7 @@ }, { "role": "assistant", - "content": "Anything else?" 
+ "content": "Anything \"else\"?" }, { "role": "user", @@ -164,5 +164,19 @@ }, "type": "function" } - ] + ], + "_test_metadata": { + "_comment": "Complex tool use scenario with multiple tool calls and responses", + "expected_strings": [ + "Print a hello world message with python.", + "Anything \"else\"?", + "Test a tautology.", + "Truth is definitely true.", + "Check it on the web.", + "I don't need the web to answer you but I did check, as you asked. What now?" + ], + "forbidden_strings": ["\\\"else\\\""], + "expected_strings_if_supports_tool_calls": ["ipython", "test", "brave_search"], + "expected_strings_if_supports_tool_responses": ["Hello, World!"] + } } \ No newline at end of file diff --git a/tests/test-capabilities.cpp b/tests/test-capabilities.cpp index 8c10eaa..bc76159 100644 --- a/tests/test-capabilities.cpp +++ b/tests/test-capabilities.cpp @@ -39,6 +39,19 @@ static std::string read_file(const std::string &path) return out; } +static std::string reasoning_format_to_string(minja::ReasoningFormat format) { + switch (format) { + case minja::ReasoningFormat::NONE: return "NONE"; + case minja::ReasoningFormat::REASONING_CONTENT_FIELD: return "REASONING_CONTENT_FIELD"; + case minja::ReasoningFormat::THINKING_CONTENT_BLOCK: return "THINKING_CONTENT_BLOCK"; + case minja::ReasoningFormat::THOUGHTS_CONTENT_BLOCK: return "THOUGHTS_CONTENT_BLOCK"; + case minja::ReasoningFormat::THOUGHT_FIELD: return "THOUGHT_FIELD"; + case minja::ReasoningFormat::TOOL_PLAN_FIELD: return "TOOL_PLAN_FIELD"; + case minja::ReasoningFormat::THINKING_FIELD: return "THINKING_FIELD"; + default: return "UNKNOWN"; + } +} + static minja::chat_template_caps get_caps(const std::string &path) { auto caps = minja::chat_template(read_file(path), "", "").original_caps(); @@ -58,8 +71,15 @@ static minja::chat_template_caps get_caps(const std::string &path) print("supports_parallel_tool_calls", caps.supports_parallel_tool_calls); print("requires_object_arguments", caps.requires_object_arguments); 
print("requires_non_null_content", caps.requires_non_null_content); - // print("requires_non_null_content", caps.requires_non_null_content); - print("requires_typed_content", caps.requires_typed_content); + print("requires_typed_content", caps.requires_typed_content_blocks); + // Reasoning capabilities (extended thinking / chain-of-thought) + print("supports_reasoning", caps.supports_reasoning); + print("reasoning_requires_tools", caps.reasoning_requires_tools); + print("supports_reasoning_without_content", caps.supports_reasoning_without_content); + print("supports_reasoning_with_content", caps.supports_reasoning_with_content); + print("supports_enable_thinking", caps.supports_enable_thinking); + print("supports_clear_thinking", caps.supports_clear_thinking); + std::cout << " EXPECT_EQ(caps.reasoning_format, minja::ReasoningFormat::" << reasoning_format_to_string(caps.reasoning_format) << ");" << std::endl; std::cout << "}\n" << std::endl; return caps; @@ -75,7 +95,7 @@ TEST(CapabilitiesTest, Gemma7b) { EXPECT_FALSE(caps.supports_parallel_tool_calls); EXPECT_FALSE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, QwQ32B) { @@ -88,7 +108,7 @@ TEST(CapabilitiesTest, QwQ32B) { EXPECT_TRUE(caps.supports_parallel_tool_calls); EXPECT_TRUE(caps.requires_object_arguments); EXPECT_TRUE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, Qwen3Coder) { @@ -101,7 +121,7 @@ TEST(CapabilitiesTest, Qwen3Coder) { EXPECT_TRUE(caps.supports_parallel_tool_calls); EXPECT_TRUE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } #ifndef _WIN32 @@ -115,7 +135,7 @@ TEST(CapabilitiesTest, DeepSeekR1Distill) { 
EXPECT_TRUE(caps.supports_parallel_tool_calls); EXPECT_FALSE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } #endif // _WIN32 @@ -129,7 +149,7 @@ TEST(CapabilitiesTest, FunctionaryMediumV3_2) { EXPECT_TRUE(caps.supports_parallel_tool_calls); EXPECT_FALSE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, MetaLlama3_1_8BInstruct) { @@ -142,7 +162,7 @@ TEST(CapabilitiesTest, MetaLlama3_1_8BInstruct) { EXPECT_FALSE(caps.supports_parallel_tool_calls); EXPECT_TRUE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, MetaLlama3_2_3BInstruct) { @@ -155,7 +175,7 @@ TEST(CapabilitiesTest, MetaLlama3_2_3BInstruct) { EXPECT_FALSE(caps.supports_parallel_tool_calls); EXPECT_TRUE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, MetaLlama3_3_70BInstruct) { @@ -168,7 +188,7 @@ TEST(CapabilitiesTest, MetaLlama3_3_70BInstruct) { EXPECT_FALSE(caps.supports_parallel_tool_calls); EXPECT_TRUE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, MiniMaxAIText01) { @@ -181,7 +201,7 @@ TEST(CapabilitiesTest, MiniMaxAIText01) { EXPECT_FALSE(caps.supports_parallel_tool_calls); EXPECT_FALSE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_TRUE(caps.requires_typed_content); + EXPECT_TRUE(caps.requires_typed_content_blocks); } 
TEST(CapabilitiesTest, Mistral7BInstruct) { @@ -194,7 +214,7 @@ TEST(CapabilitiesTest, Mistral7BInstruct) { EXPECT_FALSE(caps.supports_parallel_tool_calls); EXPECT_FALSE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, MistralNemoInstruct) { @@ -207,7 +227,7 @@ TEST(CapabilitiesTest, MistralNemoInstruct) { EXPECT_TRUE(caps.supports_parallel_tool_calls); EXPECT_TRUE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, NousResearchHermes3Llama3_1_70BToolUse) { @@ -220,7 +240,7 @@ TEST(CapabilitiesTest, NousResearchHermes3Llama3_1_70BToolUse) { EXPECT_TRUE(caps.supports_parallel_tool_calls); EXPECT_FALSE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, NousResearchHermes2ProLlama3_8BToolUse) { @@ -233,7 +253,7 @@ TEST(CapabilitiesTest, NousResearchHermes2ProLlama3_8BToolUse) { EXPECT_TRUE(caps.supports_parallel_tool_calls); EXPECT_FALSE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, CommandRPlusDefault) { @@ -246,7 +266,7 @@ TEST(CapabilitiesTest, CommandRPlusDefault) { EXPECT_FALSE(caps.supports_parallel_tool_calls); EXPECT_FALSE(caps.requires_object_arguments); EXPECT_TRUE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, CommandRPlusRag) { @@ -259,7 +279,7 @@ TEST(CapabilitiesTest, CommandRPlusRag) { EXPECT_FALSE(caps.supports_parallel_tool_calls); 
EXPECT_FALSE(caps.requires_object_arguments); EXPECT_TRUE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, CommandRPlusToolUse) { @@ -272,7 +292,7 @@ TEST(CapabilitiesTest, CommandRPlusToolUse) { EXPECT_TRUE(caps.supports_parallel_tool_calls); EXPECT_TRUE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } TEST(CapabilitiesTest, GLM46) { @@ -285,7 +305,7 @@ TEST(CapabilitiesTest, GLM46) { EXPECT_TRUE(caps.supports_parallel_tool_calls); EXPECT_TRUE(caps.requires_object_arguments); EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); } // Synthetic template based on DeepSeek V3.2's DSML format (encoding_dsv32.py) @@ -301,6 +321,116 @@ TEST(CapabilitiesTest, SyntheticDeepSeekV3_2_DSML) { EXPECT_TRUE(caps.supports_parallel_tool_calls); // Iterates over tool_calls array EXPECT_TRUE(caps.requires_object_arguments); // DSML iterates over argument keys EXPECT_FALSE(caps.requires_non_null_content); - EXPECT_FALSE(caps.requires_typed_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); + // Reasoning capabilities - synthetic template doesn't support reasoning_content field + EXPECT_FALSE(caps.supports_reasoning); +} + +// Reasoning model tests +// Note: DeepSeek R1 does NOT support reasoning_content field - it looks for tags embedded in content +// These tests are for models that DO support the reasoning_content field + +#ifndef _WIN32 +TEST(CapabilitiesTest, Qwen3_235B_A22B_Thinking_2507) { + auto caps = get_caps("tests/Qwen-Qwen3-235B-A22B-Thinking-2507.jinja"); + EXPECT_TRUE(caps.supports_system_role); + EXPECT_TRUE(caps.supports_tools); + EXPECT_TRUE(caps.supports_tool_calls); + EXPECT_FALSE(caps.supports_tool_call_id); + 
EXPECT_TRUE(caps.supports_tool_responses); + EXPECT_TRUE(caps.supports_parallel_tool_calls); + EXPECT_FALSE(caps.requires_object_arguments); + EXPECT_FALSE(caps.requires_non_null_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); + // Qwen supports reasoning_content field + EXPECT_TRUE(caps.supports_reasoning); +} + +TEST(CapabilitiesTest, GLM_4_6) { + auto caps = get_caps("tests/zai-org-GLM-4.6.jinja"); + EXPECT_TRUE(caps.supports_system_role); + EXPECT_TRUE(caps.supports_tools); + EXPECT_TRUE(caps.supports_tool_calls); + EXPECT_FALSE(caps.supports_tool_call_id); + EXPECT_TRUE(caps.supports_tool_responses); + EXPECT_TRUE(caps.supports_parallel_tool_calls); + EXPECT_TRUE(caps.requires_object_arguments); + EXPECT_FALSE(caps.requires_non_null_content); + EXPECT_FALSE(caps.requires_typed_content_blocks); + // GLM-4.6 supports reasoning_content field + EXPECT_TRUE(caps.supports_reasoning); +} +#endif // _WIN32 + +// ReasoningFormat tests - verify detection of different reasoning formats + +// Pattern A: REASONING_CONTENT (Qwen3, GLM-4.6/4.7) +TEST(ReasoningFormatTest, ReasoningContentField_GLM47) { + auto caps = get_caps("tests/zai-org-GLM-4.7.jinja"); + EXPECT_TRUE(caps.supports_reasoning); + EXPECT_EQ(caps.reasoning_format, minja::ReasoningFormat::REASONING_CONTENT_FIELD); + // GLM-4.7 supports reasoning visibility control (clear_thinking flag) + EXPECT_TRUE(caps.supports_clear_thinking); +} + +TEST(ReasoningFormatTest, ReasoningContentField_Qwen3) { + auto caps = get_caps("tests/Qwen-Qwen3-4B.jinja"); + EXPECT_TRUE(caps.supports_reasoning); + EXPECT_EQ(caps.reasoning_format, minja::ReasoningFormat::REASONING_CONTENT_FIELD); +} + +// Pattern D: THOUGHT_FIELD (MiniCPM3) +TEST(ReasoningFormatTest, ThoughtField_MiniCPM3) { + auto caps = get_caps("tests/openbmb-MiniCPM3-4B.jinja"); + EXPECT_TRUE(caps.supports_reasoning); + EXPECT_EQ(caps.reasoning_format, minja::ReasoningFormat::THOUGHT_FIELD); +} + +// Pattern E: TOOL_PLAN_FIELD (Command-R7B) - requires tools 
+// Note: Command-R7B is excluded on Windows (https://github.com/google/minja/issues/40) +#ifndef _WIN32 +TEST(ReasoningFormatTest, ToolPlanField_CommandR7B) { + auto caps = get_caps("tests/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja"); + EXPECT_TRUE(caps.supports_reasoning); + EXPECT_EQ(caps.reasoning_format, minja::ReasoningFormat::TOOL_PLAN_FIELD); + EXPECT_TRUE(caps.reasoning_requires_tools); +} +#endif // _WIN32 + +// Pattern NONE: Templates without reasoning support +TEST(ReasoningFormatTest, NoReasoning_Gemma7b) { + auto caps = get_caps("tests/google-gemma-7b-it.jinja"); + EXPECT_FALSE(caps.supports_reasoning); + EXPECT_EQ(caps.reasoning_format, minja::ReasoningFormat::NONE); +} + +TEST(ReasoningFormatTest, NoReasoning_Llama31) { + auto caps = get_caps("tests/meta-llama-Llama-3.1-8B-Instruct.jinja"); + EXPECT_FALSE(caps.supports_reasoning); + EXPECT_EQ(caps.reasoning_format, minja::ReasoningFormat::NONE); +} + +// Test Kimi K2 - supports reasoning via THOUGHTS_CONTENT_BLOCK +// The template's render_content macro iterates over content blocks and outputs text +TEST(ReasoningFormatTest, ThoughtsContentBlock_KimiK2) { + auto caps = get_caps("tests/moonshotai-Kimi-K2-Instruct.jinja"); + EXPECT_TRUE(caps.supports_reasoning); + EXPECT_EQ(caps.reasoning_format, minja::ReasoningFormat::THOUGHTS_CONTENT_BLOCK); + EXPECT_FALSE(caps.reasoning_requires_tools); +} + +// Test that REASONING_CONTENT_FIELD models don't require tools for reasoning +TEST(ReasoningFormatTest, ReasoningContentNoToolsRequired_Qwen3) { + auto caps = get_caps("tests/Qwen-Qwen3-4B.jinja"); + EXPECT_TRUE(caps.supports_reasoning); + EXPECT_EQ(caps.reasoning_format, minja::ReasoningFormat::REASONING_CONTENT_FIELD); + EXPECT_FALSE(caps.reasoning_requires_tools); +} + +TEST(ReasoningFormatTest, ReasoningContentNoToolsRequired_GLM47) { + auto caps = get_caps("tests/zai-org-GLM-4.7.jinja"); + EXPECT_TRUE(caps.supports_reasoning); + EXPECT_EQ(caps.reasoning_format, 
minja::ReasoningFormat::REASONING_CONTENT_FIELD);
+    EXPECT_FALSE(caps.reasoning_requires_tools);
 }
diff --git a/tests/test-supported-template.cpp b/tests/test-supported-template.cpp
index 88a9bbb..1eaad53 100644
--- a/tests/test-supported-template.cpp
+++ b/tests/test-supported-template.cpp
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include <regex>
@@ -22,6 +23,16 @@
 using json = nlohmann::ordered_json;
 
+#ifdef _WIN32
+// Workaround for https://github.com/ochafik/minja/issues/16
+// On Windows, C++ minja outputs fewer newlines than Python Jinja2 for certain templates.
+// This function collapses each run of consecutive newlines into a single newline
+// (removing blank lines) to normalize comparison.
+static std::string collapse_blank_lines(const std::string &s) {
+    static const std::regex blank_lines_regex("\n\n+");
+    return std::regex_replace(s, blank_lines_regex, "\n");
+}
+#endif
+
 template <class T>
 static void assert_equals(const T &expected, const T &actual){
     if (expected != actual) {
@@ -76,7 +87,7 @@ static json caps_to_json(const minja::chat_template_caps &caps) {
         {"supports_tool_call_id", caps.supports_tool_call_id},
         {"requires_object_arguments", caps.requires_object_arguments},
         // {"requires_non_null_content", caps.requires_non_null_content},
-        {"requires_typed_content", caps.requires_typed_content},
+        {"requires_typed_content_blocks", caps.requires_typed_content_blocks},
     };
 }
 #endif
@@ -152,7 +163,96 @@ int main(int argc, char *argv[]) {
             return 1;
         }
 
-        if (expected != actual) {
+        // Validate expected/forbidden strings from _test_metadata if present
+        // This provides template-independent validation that doesn't rely on Python goldens
+        auto original_ctx = json::parse(read_file(ctx_file));
+        if (original_ctx.contains("_test_metadata")) {
+            auto metadata = original_ctx["_test_metadata"];
+            auto caps = tmpl.original_caps();
+
+            // Check expected_strings (always required)
+            if (metadata.contains("expected_strings")) {
+                for (const auto& s : metadata["expected_strings"]) {
+                    std::string expected_str =
s.get<std::string>();
+                    if (actual.find(expected_str) == std::string::npos) {
+                        std::cerr << "Expected string not found in output: " << expected_str << "\n";
+                        std::cerr << "Actual output:\n" << actual << "\n";
+                        return 1;
+                    }
+                }
+            }
+
+            // Helper lambda to check expected strings
+            auto check_expected_strings = [&](const std::string& key, bool condition, const std::string& desc) -> bool {
+                if (metadata.contains(key) && condition) {
+                    for (const auto& s : metadata[key]) {
+                        std::string expected_str = s.get<std::string>();
+                        if (actual.find(expected_str) == std::string::npos) {
+                            std::cerr << "Expected string (" << desc << ") not found in output: " << expected_str << "\n";
+                            std::cerr << "Actual output:\n" << actual << "\n";
+                            return false;
+                        }
+                    }
+                }
+                return true;
+            };
+
+            // Check expected_strings_if_supports_system_role
+            if (!check_expected_strings("expected_strings_if_supports_system_role", caps.supports_system_role, "system role")) {
+                return 1;
+            }
+
+            // Check expected_strings_if_supports_tool_calls
+            if (!check_expected_strings("expected_strings_if_supports_tool_calls", caps.supports_tool_calls, "tool calls")) {
+                return 1;
+            }
+
+            // Check expected_strings_if_supports_tool_responses
+            if (!check_expected_strings("expected_strings_if_supports_tool_responses", caps.supports_tool_responses, "tool responses")) {
+                return 1;
+            }
+
+            // Check expected_strings_if_supports_reasoning (with additional conditions)
+            // If context uses clear_thinking, only check if template supports it
+            // If template requires tools for reasoning (TOOL_PLAN_FIELD), only check if context has tool_calls
+            bool context_uses_clear_thinking = original_ctx.contains("clear_thinking");
+            bool context_has_tool_calls = false;
+            for (const auto& msg : original_ctx["messages"]) {
+                if (msg.contains("tool_calls") && !msg["tool_calls"].empty()) {
+                    context_has_tool_calls = true;
+                    break;
+                }
+            }
+            bool should_check_reasoning_strings = caps.supports_reasoning
+                && (!context_uses_clear_thinking ||
caps.supports_clear_thinking)
+                && (!caps.reasoning_requires_tools || context_has_tool_calls);
+            if (!check_expected_strings("expected_strings_if_supports_reasoning", should_check_reasoning_strings, "reasoning")) {
+                return 1;
+            }
+
+            // Check forbidden_strings (should never appear)
+            if (metadata.contains("forbidden_strings")) {
+                for (const auto& s : metadata["forbidden_strings"]) {
+                    std::string forbidden_str = s.get<std::string>();
+                    if (actual.find(forbidden_str) != std::string::npos) {
+                        std::cerr << "Forbidden string found in output: " << forbidden_str << "\n";
+                        std::cerr << "Actual output:\n" << actual << "\n";
+                        return 1;
+                    }
+                }
+            }
+        }
+
+#ifdef _WIN32
+        // On Windows, collapse blank lines for comparison due to known whitespace handling issues
+        auto expected_cmp = collapse_blank_lines(expected);
+        auto actual_cmp = collapse_blank_lines(actual);
+#else
+        auto expected_cmp = expected;
+        auto actual_cmp = actual;
+#endif
+
+        if (expected_cmp != actual_cmp) {
             if (getenv("WRITE_GOLDENS")) {
                 write_file(golden_file, actual);
                 std::cerr << "Updated golden file: " << golden_file << "\n";
diff --git a/tests/test-syntax.cpp b/tests/test-syntax.cpp
index ebab4eb..f1d5916 100644
--- a/tests/test-syntax.cpp
+++ b/tests/test-syntax.cpp
@@ -262,6 +262,14 @@ TEST(SyntaxTest, SimpleCases) {
     EXPECT_EQ(
         R"({"a": "b"})",
         render(R"({{ {"a": "b"} | tojson }})", {}, {}));
+    // Test tojson with compact separators (used by Kimi K2 template)
+    EXPECT_EQ(
+        R"({"a":"b","c":[1,2]})",
+        render(R"({{ {"a": "b", "c": [1, 2]} | tojson(separators=(',', ':')) }})", {}, {}));
+    // Test tojson with exotic separators to verify they're actually used
+    EXPECT_EQ(
+        R"({"a"=>"b";"c"=>[1;2]})",
+        render(R"({{ {"a": "b", "c": [1, 2]} | tojson(separators=(';', '=>')) }})", {}, {}));
     EXPECT_EQ(
         R"({'a': 'b'})",
         render(R"({{ {"a": "b"} }})", {}, {}));