def _inject_reasoning_content(messages: list) -> None:
    """Inject reasoning_content as <think> blocks into content.

    Chat templates only reference message["content"] — they don't see
    reasoning_content. This converts it back to <think>...</think> blocks so the
    model sees its own prior chain-of-thought across turns.

    Mutates ``messages`` in place: for each assistant message carrying a
    non-empty ``reasoning_content`` (str or list-of-str segments), the wrapped
    reasoning is prepended to ``content`` and the ``reasoning_content`` key is
    removed so a chat template never sees both representations.
    """
    for msg in messages:
        # Only assistant turns carry model reasoning; user/tool turns untouched.
        if msg.get("role") != "assistant":
            continue
        reasoning = msg.get("reasoning_content")
        if not reasoning:
            continue

        # Build <think>-wrapped text.
        if isinstance(reasoning, str):
            think_text = f"<think>{reasoning}</think>"
        elif isinstance(reasoning, list):
            # Segments variant: wrap each non-empty segment individually.
            parts = [f"<think>{seg}</think>" for seg in reasoning if seg]
            think_text = "".join(parts)
        else:
            # Unknown type (dict/number/...) — skip, don't corrupt.
            continue

        if not think_text:
            continue

        # Prepend to content, preserving the original generation order
        # (<think>...</think> first, then the visible response).
        existing = msg.get("content")
        if isinstance(existing, str):
            msg["content"] = think_text + existing
        elif isinstance(existing, list):
            # Multimodal content array — prepend as a text part, don't replace.
            msg["content"] = [{"type": "text", "text": think_text}] + existing
        else:
            # null or absent content — reasoning becomes the content.
            msg["content"] = think_text

        # Remove so the template doesn't see both the injected <think> block
        # and the raw reasoning_content field.
        msg.pop("reasoning_content", None)
+ extra_kwargs = {} + if "chat_template_kwargs" in request: + extra_kwargs.update(request["chat_template_kwargs"]) + if "chat_template_args" in request: + extra_kwargs.update(request["chat_template_args"]) + # Strip keys that are already set explicitly to avoid + # TypeError: got multiple values for keyword argument. + for reserved in ("tokenize", "add_generation_prompt"): + extra_kwargs.pop(reserved, None) + + # Inject reasoning_content as blocks into content, + # but only if the template doesn't handle it natively. + # Templates like Nemotron and Qwen3 reference reasoning_content + # directly — injecting would produce duplicate blocks. + chat_template_src = getattr(self.tokenizer, "chat_template", "") or "" + if "reasoning_content" not in chat_template_src: + _inject_reasoning_content(request["messages"]) + return self.tokenizer.apply_chat_template( - request["messages"], tokenize=False, add_generation_prompt=True + request["messages"], + tokenize=False, + add_generation_prompt=True, + **extra_kwargs, ) elif "prompt" in request: return self.tokenizer.encode(request["prompt"]) diff --git a/components/src/dynamo/common/utils/tests/test_inject_reasoning_content.py b/components/src/dynamo/common/utils/tests/test_inject_reasoning_content.py new file mode 100644 index 000000000000..98ea9ee0ead0 --- /dev/null +++ b/components/src/dynamo/common/utils/tests/test_inject_reasoning_content.py @@ -0,0 +1,235 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Unit tests for _inject_reasoning_content in input_params.py. + +Verifies that reasoning_content from prior assistant turns is converted +to blocks in the content field before chat template rendering. 
+""" + +import copy + +from dynamo.common.utils.input_params import _inject_reasoning_content + + +class TestInjectReasoningContent: + """Test suite for _inject_reasoning_content""" + + def test_text_variant_prepends_to_content(self): + """Text reasoning_content is wrapped in and prepended.""" + messages = [ + { + "role": "assistant", + "content": "The answer is 12.", + "reasoning_content": "sqrt(144) = 12", + }, + ] + _inject_reasoning_content(messages) + + assert ( + messages[0]["content"] == "sqrt(144) = 12The answer is 12." + ) + assert "reasoning_content" not in messages[0] + + def test_segments_variant_wraps_each_segment(self): + """Segments are individually wrapped in blocks.""" + messages = [ + { + "role": "assistant", + "content": "Done.", + "reasoning_content": ["first thought", "second thought", ""], + }, + ] + _inject_reasoning_content(messages) + + content = messages[0]["content"] + assert content.startswith("first thought") + assert "second thought" in content + assert "" not in content # empty segment skipped + assert content.endswith("Done.") + assert "reasoning_content" not in messages[0] + + def test_null_content_creates_from_reasoning(self): + """When content is null/None, reasoning becomes the content.""" + messages = [ + {"role": "assistant", "content": None, "reasoning_content": "Thinking..."}, + ] + _inject_reasoning_content(messages) + + assert messages[0]["content"] == "Thinking..." + + def test_absent_content_creates_from_reasoning(self): + """When content key is absent, reasoning becomes the content.""" + messages = [ + {"role": "assistant", "reasoning_content": "Thinking..."}, + ] + _inject_reasoning_content(messages) + + assert messages[0]["content"] == "Thinking..." 
+ + def test_multimodal_content_prepends_text_part(self): + """Array content gets a text part prepended, not replaced.""" + messages = [ + { + "role": "assistant", + "content": [{"type": "text", "text": "Here is the image."}], + "reasoning_content": "Analyzing the image...", + }, + ] + _inject_reasoning_content(messages) + + content = messages[0]["content"] + assert isinstance(content, list) + assert len(content) == 2 + assert content[0] == { + "type": "text", + "text": "Analyzing the image...", + } + assert content[1] == {"type": "text", "text": "Here is the image."} + + def test_skips_non_assistant_messages(self): + """User and tool messages are not modified.""" + messages = [ + { + "role": "user", + "content": "hello", + "reasoning_content": "should not touch", + }, + { + "role": "tool", + "content": "result", + "reasoning_content": "should not touch", + }, + ] + original = copy.deepcopy(messages) + _inject_reasoning_content(messages) + + assert messages == original + + def test_skips_empty_reasoning(self): + """Empty string reasoning_content is skipped.""" + messages = [ + {"role": "assistant", "content": "Answer.", "reasoning_content": ""}, + ] + _inject_reasoning_content(messages) + + assert messages[0]["content"] == "Answer." + # reasoning_content not removed since we skipped (falsy check) + + def test_agentic_multi_turn_tool_call_flow(self): + """Full agentic flow: reason → tool_call → tool_result → reason → answer.""" + messages = [ + {"role": "user", "content": "What is sqrt(144) + sqrt(256)?"}, + { + "role": "assistant", + "content": None, + "reasoning_content": "I need to compute sqrt(144) first.", + "tool_calls": [ + { + "id": "call_0", + "type": "function", + "function": { + "name": "calc", + "arguments": '{"expr": "sqrt(144)"}', + }, + }, + ], + }, + {"role": "tool", "tool_call_id": "call_0", "content": "12"}, + { + "role": "assistant", + "content": "The answer is 28.", + "reasoning_content": "Got 12. sqrt(256) = 16. 
Sum = 28.", + }, + {"role": "user", "content": "Thanks!"}, + ] + _inject_reasoning_content(messages) + + # First assistant turn: reasoning injected, null content → reasoning only + assert ( + messages[1]["content"] + == "I need to compute sqrt(144) first." + ) + assert "reasoning_content" not in messages[1] + assert "tool_calls" in messages[1] # tool_calls untouched + + # Tool message untouched + assert messages[2]["content"] == "12" + + # Second assistant turn: reasoning prepended to content + assert ( + messages[3]["content"] + == "Got 12. sqrt(256) = 16. Sum = 28.The answer is 28." + ) + assert "reasoning_content" not in messages[3] + + # User messages untouched + assert messages[0]["content"] == "What is sqrt(144) + sqrt(256)?" + assert messages[4]["content"] == "Thanks!" + + +class TestInputParamManagerReasoningInjection: + """Test that InputParamManager respects template introspection.""" + + def test_injects_when_template_ignores_reasoning(self): + """Templates without reasoning_content get injection.""" + from unittest.mock import MagicMock + + tokenizer = MagicMock() + tokenizer.chat_template = ( + "{% for m in messages %}{{ m.role }}: {{ m.content }}{% endfor %}" + ) + tokenizer.apply_chat_template = MagicMock(return_value="rendered") + + from dynamo.common.utils.input_params import InputParamManager + + mgr = InputParamManager(tokenizer) + request = { + "messages": [ + { + "role": "assistant", + "content": "Hi.", + "reasoning_content": "thinking...", + }, + {"role": "user", "content": "Bye"}, + ] + } + mgr.get_input_param(request, use_tokenizer=True) + + # Verify injection happened: reasoning_content removed, content has + called_messages = tokenizer.apply_chat_template.call_args[0][0] + assert "reasoning_content" not in called_messages[0] + assert called_messages[0]["content"].startswith("thinking...") + + def test_skips_injection_when_template_handles_reasoning(self): + """Templates with reasoning_content are left alone.""" + from unittest.mock import 
MagicMock + + tokenizer = MagicMock() + tokenizer.chat_template = ( + "{% for m in messages %}" + "{% if m.reasoning_content %}{{ m.reasoning_content }}{% endif %}" + "{{ m.role }}: {{ m.content }}{% endfor %}" + ) + tokenizer.apply_chat_template = MagicMock(return_value="rendered") + + from dynamo.common.utils.input_params import InputParamManager + + mgr = InputParamManager(tokenizer) + request = { + "messages": [ + { + "role": "assistant", + "content": "Hi.", + "reasoning_content": "thinking...", + }, + {"role": "user", "content": "Bye"}, + ] + } + mgr.get_input_param(request, use_tokenizer=True) + + # Verify injection was skipped: reasoning_content still present, content unchanged + called_messages = tokenizer.apply_chat_template.call_args[0][0] + assert called_messages[0]["reasoning_content"] == "thinking..." + assert called_messages[0]["content"] == "Hi." diff --git a/lib/llm/src/http/service/anthropic.rs b/lib/llm/src/http/service/anthropic.rs index 1bf799bb6592..15e798cd8c04 100644 --- a/lib/llm/src/http/service/anthropic.rs +++ b/lib/llm/src/http/service/anthropic.rs @@ -33,7 +33,6 @@ use super::{ metrics::{CancellationLabels, Endpoint, process_response_and_observe_metrics}, service_v2, }; -use crate::preprocessor::OpenAIPreprocessor; use crate::protocols::anthropic::stream_converter::AnthropicStreamConverter; use crate::protocols::anthropic::types::{ AnthropicCountTokensRequest, AnthropicCountTokensResponse, AnthropicCreateMessageRequest, @@ -192,19 +191,30 @@ async fn anthropic_messages( tracing::trace!("Received Anthropic messages request: {:?}", &*request); + // Look up engine and parsing options early so we know whether a reasoning + // parser is configured before converting the request. 
+ let (engine, parsing_options) = state + .manager() + .get_chat_completions_engine_with_parsing(&model) + .map_err(|_| { + anthropic_error( + StatusCode::NOT_FOUND, + "not_found_error", + &format!("Model '{}' not found", model), + ) + })?; + let (orig_request, context) = request.into_parts(); let model_for_resp = orig_request.model.clone(); - // Check if the Anthropic request explicitly enabled thinking. When thinking - // is enabled, reasoning-capable models' chat templates typically inject - // `` into the prompt, so the completion starts mid-reasoning. - let thinking_enabled = orig_request + // Check if the Anthropic request explicitly disabled thinking. + let thinking_explicitly_disabled = orig_request .thinking .as_ref() - .is_some_and(|t| t.thinking_type == "enabled"); + .is_some_and(|t| t.thinking_type == "disabled"); // Convert Anthropic request -> Chat Completion request - let chat_request: NvCreateChatCompletionRequest = + let mut chat_request: NvCreateChatCompletionRequest = orig_request.try_into().map_err(|e: anyhow::Error| { tracing::error!( request_id, @@ -218,20 +228,42 @@ async fn anthropic_messages( ) })?; - let request = context.map(|_req| chat_request); - - tracing::trace!("Getting chat completions engine for model: {}", model); + // When a reasoning parser is configured and the client hasn't explicitly + // disabled thinking, assume the model's chat template will inject ``. + // + // Two things must be aligned: + // 1. chat_template_args must include enable_thinking=true so the backend's + // template actually injects `` into the prompt. For the + // ModelInput::Text path (SGLang without --skip-tokenizer-init), the + // backend applies the template — without explicit enable_thinking the + // result depends on the template's default which varies by model. + // 2. 
prompt_injected_reasoning must be true so the parser starts in + // reasoning mode with stripped_think_start=true, which is critical for + // correct `` boundary detection in the streaming path. + // + // The OpenAI path handles this in the preprocessor: it renders the template, + // inspects the formatted prompt for a trailing ``, and sets + // prompt_injected_reasoning accordingly. The Anthropic path bypasses the + // preprocessor, so we infer prompt injection from the reasoning parser config. + let prompt_injected_reasoning = + parsing_options.reasoning_parser.is_some() && !thinking_explicitly_disabled; + + if prompt_injected_reasoning { + let args = chat_request + .chat_template_args + .get_or_insert_with(Default::default); + args.entry("enable_thinking".to_string()) + .or_insert(serde_json::Value::Bool(true)); + // Preserve reasoning from prior turns. Some templates (Nemotron) + // strip historical content by default to save context. + // For agentic flows the model needs to see why it made prior decisions. + // Ref: NVIDIA's SWE training config also sets this to false: + // https://github.com/NVIDIA-NeMo/Nemotron/blob/main/src/nemotron/recipes/super3/stage2_rl/stage2_swe2/config/default.yaml#L287 + args.entry("truncate_history_thinking".to_string()) + .or_insert(serde_json::Value::Bool(false)); + } - let (engine, parsing_options) = state - .manager() - .get_chat_completions_engine_with_parsing(&model) - .map_err(|_| { - anthropic_error( - StatusCode::NOT_FOUND, - "not_found_error", - &format!("Model '{}' not found", model), - ) - })?; + let request = context.map(|_req| chat_request); let mut response_collector = state.metrics_clone().create_response_collector(&model); @@ -247,27 +279,25 @@ async fn anthropic_messages( let ctx = engine_stream.context(); - // Apply reasoning parser to the engine stream if configured. 
- // The preprocessor (which normally handles this for the OpenAI path) is - // bypassed by the Anthropic endpoint, so we apply the same stream - // transform here. This populates `delta.reasoning_content` which the - // AnthropicStreamConverter translates into thinking content blocks. + // NOTE: We intentionally do NOT apply a reasoning parser here. // - // When thinking is enabled, the model's chat template likely injected - // `` into the prompt (e.g., Qwen3.5), so the parser must start - // in reasoning mode — the completion begins mid-reasoning without an - // explicit `` tag. + // For ModelInput::Tokens backends (skip_tokenizer_init=True), the engine + // pipeline includes the OpenAI preprocessor which already applies reasoning + // parsing in its backward edge (postprocessor_parsing_stream). The stream + // arriving here already has reasoning_content and content correctly split. + // Applying a second parser would re-classify post-think content chunks + // (where reasoning_content=None, content=Some) as reasoning, because the + // boundary was consumed by the first parser and doesn't appear + // in the detokenized text. + // + // For ModelInput::Text backends (PushRouter, no preprocessor), reasoning + // parsing is NOT handled in the streaming path — the backend puts raw text + // (including tags) in delta.content with reasoning_content=None. + // This is a known gap that affects all streaming handlers (OpenAI, Anthropic, + // Responses API) equally. 
let engine_stream: Pin< Box> + Send>, - > = if let Some(ref reasoning_parser_name) = parsing_options.reasoning_parser { - Box::pin(OpenAIPreprocessor::parse_reasoning_content_from_stream( - engine_stream, - reasoning_parser_name.clone(), - thinking_enabled, - )) - } else { - Box::pin(engine_stream) - }; + > = Box::pin(engine_stream); let mut inflight_guard = state diff --git a/lib/llm/src/preprocessor/prompt/template.rs b/lib/llm/src/preprocessor/prompt/template.rs index a2b8a47a4674..f51bdca29928 100644 --- a/lib/llm/src/preprocessor/prompt/template.rs +++ b/lib/llm/src/preprocessor/prompt/template.rs @@ -135,6 +135,9 @@ struct HfTokenizerConfigJsonFormatter { /// When true, strip tool definitions from the chat template when tool_choice is "none". /// This prevents models from generating raw XML tool calls in the content field. exclude_tools_when_tool_choice_none: bool, + /// True if the chat template natively references `reasoning_content`. + /// When true, skip injection — the template handles it. + template_handles_reasoning: bool, } // /// OpenAI Standard Prompt Formatter diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs index 12171b890f41..909ad610bc9f 100644 --- a/lib/llm/src/preprocessor/prompt/template/formatters.rs +++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs @@ -161,6 +161,12 @@ impl HfTokenizerConfigJsonFormatter { // Detect at model load time whether this template requires content arrays let requires_content_arrays = detect_content_array_usage(&env); + // Detect if the template natively handles reasoning_content (e.g. Nemotron, Qwen3). + // If so, we must NOT inject blocks — the template does it itself. 
+ let template_handles_reasoning = env + .templates() + .any(|(_, tmpl)| tmpl.source().contains("reasoning_content")); + Ok(HfTokenizerConfigJsonFormatter { env, config, @@ -168,6 +174,7 @@ impl HfTokenizerConfigJsonFormatter { supports_add_generation_prompt: supports_add_generation_prompt.unwrap_or(false), requires_content_arrays, exclude_tools_when_tool_choice_none, + template_handles_reasoning, }) } } diff --git a/lib/llm/src/preprocessor/prompt/template/oai.rs b/lib/llm/src/preprocessor/prompt/template/oai.rs index f6392b0e0028..cfb4ad9c3afa 100644 --- a/lib/llm/src/preprocessor/prompt/template/oai.rs +++ b/lib/llm/src/preprocessor/prompt/template/oai.rs @@ -203,6 +203,82 @@ fn normalize_tool_arguments_in_messages(messages: &mut serde_json::Value) { } } +/// Inject `reasoning_content` back into the `content` field as `` blocks. +/// +/// Chat templates only reference `{{ message.content }}` — they don't know about +/// `reasoning_content`. Without this injection, the model's prior chain-of-thought +/// is silently dropped across turns. +/// +/// Uses ``/`` delimiters — the same tags that reasoning models emit +/// and that the reasoning parser strips on output. Reasoning is prepended to content +/// to match the original generation order (`... response`). +/// +/// Segments are concatenated rather than interleaved with tool_calls because Jinja +/// templates render `tool_calls` separately from `content`. The model still sees +/// all reasoning text before the template-rendered tool call block. 
+fn inject_reasoning_content_into_messages(messages: &mut serde_json::Value) { + let Some(msgs) = messages.as_array_mut() else { + return; + }; + + for msg in msgs.iter_mut() { + if msg.get("role").and_then(|r| r.as_str()) != Some("assistant") { + continue; + } + + let reasoning = match msg.get("reasoning_content") { + Some(serde_json::Value::String(s)) if !s.is_empty() => { + format!("{}", s) + } + Some(serde_json::Value::Array(segments)) => { + let mut result = String::new(); + for seg in segments { + if let Some(s) = seg.as_str() + && !s.is_empty() + { + result.push_str(""); + result.push_str(s); + result.push_str(""); + } + } + if result.is_empty() { + continue; + } + result + } + _ => continue, + }; + + match msg.get("content") { + // Content is a string or null — prepend reasoning as text + Some(serde_json::Value::String(s)) if !s.is_empty() => { + msg["content"] = serde_json::Value::String(format!("{}{}", reasoning, s)); + } + None | Some(serde_json::Value::Null) | Some(serde_json::Value::String(_)) => { + msg["content"] = serde_json::Value::String(reasoning); + } + // Content is an array (multimodal) — prepend as a text part + Some(serde_json::Value::Array(_)) => { + let think_part = serde_json::json!({ + "type": "text", + "text": reasoning + }); + if let Some(arr) = msg.get_mut("content").and_then(|v| v.as_array_mut()) { + arr.insert(0, think_part); + } + } + // Other types (number, bool, object) — skip, don't corrupt + _ => continue, + } + + // Remove so the template doesn't see both the injected in content + // and the original reasoning_content field. 
+ if let Some(obj) = msg.as_object_mut() { + obj.remove("reasoning_content"); + } + } +} + impl OAIChatLikeRequest for NvCreateChatCompletionRequest { fn model(&self) -> String { self.inner.model.clone() @@ -378,6 +454,14 @@ impl OAIPromptFormatter for HfTokenizerConfigJsonFormatter { normalize_tool_arguments_in_messages(&mut messages_for_template); + // Inject reasoning_content as blocks into content — but only if + // the template doesn't handle it natively. Templates like Nemotron and + // Qwen3 reference reasoning_content directly in their Jinja logic; injecting + // would produce duplicate blocks. + if !self.template_handles_reasoning { + inject_reasoning_content_into_messages(&mut messages_for_template); + } + let ctx = context! { messages => messages_for_template, tools => tools, @@ -1312,4 +1396,337 @@ NORMAL_MODE result ); } + + #[test] + fn test_inject_reasoning_content_segments_with_tool_calls() { + // Assistant message with reasoning_content segments and tool_calls + let mut messages = serde_json::json!([ + { + "role": "user", + "content": "What is sqrt(144) and sqrt(256)?" 
+ }, + { + "role": "assistant", + "content": "Let me calculate those.", + "reasoning_content": ["I need to compute sqrt(144)", "Now sqrt(256)", ""], + "tool_calls": [ + { + "id": "call_0", + "type": "function", + "function": { + "name": "calculator", + "arguments": "{\"expr\": \"sqrt(144)\"}" + } + }, + { + "id": "call_1", + "type": "function", + "function": { + "name": "calculator", + "arguments": "{\"expr\": \"sqrt(256)\"}" + } + } + ] + } + ]); + + inject_reasoning_content_into_messages(&mut messages); + + let assistant = &messages[1]; + + // reasoning_content should be removed + assert!( + assistant.get("reasoning_content").is_none(), + "reasoning_content should be removed after injection" + ); + + // content should have blocks prepended (empty segment skipped) + let content = assistant["content"].as_str().unwrap(); + assert!( + content.starts_with("I need to compute sqrt(144)"), + "content should start with first reasoning segment, got: {}", + content + ); + assert!( + content.contains("Now sqrt(256)"), + "content should contain second reasoning segment" + ); + // Empty third segment should NOT produce + assert!( + !content.contains(""), + "empty segments should be skipped" + ); + // Original content should be preserved at the end + assert!( + content.ends_with("Let me calculate those."), + "original content should be at the end, got: {}", + content + ); + + // tool_calls should be untouched + assert!(assistant.get("tool_calls").is_some()); + assert_eq!(assistant["tool_calls"].as_array().unwrap().len(), 2); + } + + #[test] + fn test_inject_reasoning_content_text_variant() { + let mut messages = serde_json::json!([ + { + "role": "assistant", + "content": "The answer is 42.", + "reasoning_content": "Let me think about this carefully." 
+ } + ]); + + inject_reasoning_content_into_messages(&mut messages); + + let assistant = &messages[0]; + assert!(assistant.get("reasoning_content").is_none()); + let content = assistant["content"].as_str().unwrap(); + assert_eq!( + content, + "Let me think about this carefully.The answer is 42." + ); + } + + #[test] + fn test_inject_reasoning_content_null_content() { + // reasoning_content present but content is null + let mut messages = serde_json::json!([ + { + "role": "assistant", + "content": null, + "reasoning_content": "Thinking...", + "tool_calls": [{"id": "call_0", "type": "function", "function": {"name": "f", "arguments": "{}"}}] + } + ]); + + inject_reasoning_content_into_messages(&mut messages); + + let content = messages[0]["content"].as_str().unwrap(); + assert_eq!(content, "Thinking..."); + assert!(messages[0].get("reasoning_content").is_none()); + } + + #[test] + fn test_inject_reasoning_content_skips_non_assistant() { + let mut messages = serde_json::json!([ + { + "role": "user", + "content": "hello", + "reasoning_content": "should not be touched" + } + ]); + + inject_reasoning_content_into_messages(&mut messages); + + // User message should be untouched + assert!(messages[0].get("reasoning_content").is_some()); + } + + // Helper: create a formatter with a minimal chat template for render tests + fn make_test_formatter() -> HfTokenizerConfigJsonFormatter { + use super::tokcfg::ChatTemplate; + use super::{ContextMixins, HfTokenizerConfigJsonFormatter}; + + // Minimal template that renders content verbatim — enough to verify + // that reasoning_content injection works through the full pipeline. 
+ let template = r#"{%- for message in messages %}{{ message.role }}: {{ message.content }} +{%- endfor %} +{%- if add_generation_prompt %}assistant:{%- endif %}"#; + + let chat_template: ChatTemplate = serde_json::from_value(serde_json::json!({ + "chat_template": template + })) + .unwrap(); + + HfTokenizerConfigJsonFormatter::new(chat_template, ContextMixins::new(&[])).unwrap() + } + + // Verify reasoning_content (Text variant) from a prior assistant turn + // appears as a block in the rendered prompt. + #[test] + fn test_reasoning_content_text_roundtrip_render() { + use super::OAIPromptFormatter; + let formatter = make_test_formatter(); + + let request: NvCreateChatCompletionRequest = serde_json::from_value(serde_json::json!({ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What is sqrt(144)?"}, + { + "role": "assistant", + "content": "The answer is 12.", + "reasoning_content": "I need to compute the square root of 144." + }, + {"role": "user", "content": "Are you sure?"} + ] + })) + .unwrap(); + + let rendered = formatter.render(&request).unwrap(); + + assert!( + rendered.contains("I need to compute the square root of 144."), + "reasoning_content must appear as block, got: {}", + rendered + ); + assert!( + rendered.contains("The answer is 12."), + "original content must be preserved" + ); + assert!( + !rendered.contains("reasoning_content"), + "raw reasoning_content field should not leak into prompt" + ); + } + + // Verify a full agentic flow: assistant reasons, calls a tool, gets a + // result, then reasons again before answering. Both reasoning turns must + // survive into the rendered prompt. 
+ #[test] + fn test_reasoning_content_agentic_tool_call_roundtrip_render() { + use super::OAIPromptFormatter; + let formatter = make_test_formatter(); + + let request: NvCreateChatCompletionRequest = serde_json::from_value(serde_json::json!({ + "model": "test-model", + "messages": [ + {"role": "user", "content": "What is sqrt(144) + sqrt(256)?"}, + { + "role": "assistant", + "content": null, + "reasoning_content": "I need to compute both square roots. Let me start with sqrt(144).", + "tool_calls": [{ + "id": "call_0", + "type": "function", + "function": { + "name": "calculator", + "arguments": "{\"expr\": \"sqrt(144)\"}" + } + }] + }, + { + "role": "tool", + "tool_call_id": "call_0", + "content": "12" + }, + { + "role": "assistant", + "content": "sqrt(144) = 12 and sqrt(256) = 16, so the answer is 28.", + "reasoning_content": "Got 12 for sqrt(144). Now sqrt(256) = 16. Sum is 28." + }, + {"role": "user", "content": "Thanks!"} + ] + })) + .unwrap(); + + let rendered = formatter.render(&request).unwrap(); + + // First assistant turn: reasoning with tool call, null content + assert!( + rendered.contains("I need to compute both square roots"), + "first turn reasoning must be in prompt, got: {}", + rendered + ); + // Second assistant turn: reasoning with final answer + assert!( + rendered.contains("Got 12 for sqrt(144)"), + "second turn reasoning must be in prompt" + ); + assert!( + rendered.contains("the answer is 28"), + "final answer content must be preserved" + ); + // No raw reasoning_content in output + assert!( + !rendered.contains("reasoning_content"), + "raw reasoning_content field should not leak into prompt" + ); + } + + // Template that does NOT reference reasoning_content — injection should happen. 
+ #[test] + fn test_reasoning_injected_when_template_ignores_it() { + use super::OAIPromptFormatter; + let formatter = make_test_formatter(); + + // Formatter uses a simple template that doesn't reference reasoning_content + assert!(!formatter.template_handles_reasoning); + + let request: NvCreateChatCompletionRequest = serde_json::from_value(serde_json::json!({ + "model": "test-model", + "messages": [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + "content": "Hi.", + "reasoning_content": "The user said hello." + }, + {"role": "user", "content": "Bye"} + ] + })) + .unwrap(); + + let rendered = formatter.render(&request).unwrap(); + assert!( + rendered.contains("The user said hello."), + "injection must happen when template ignores reasoning_content, got: {}", + rendered + ); + } + + // Template that DOES reference reasoning_content — injection must be skipped. + #[test] + fn test_reasoning_not_injected_when_template_handles_it() { + use super::tokcfg::ChatTemplate; + use super::{ContextMixins, HfTokenizerConfigJsonFormatter, OAIPromptFormatter}; + + // Template that natively renders reasoning_content (like Nemotron/Qwen3) + let template = r#"{%- for message in messages %}{%- if message.role == "assistant" and message.reasoning_content is defined and message.reasoning_content %}{{ message.reasoning_content }} +{%- endif %}{{ message.role }}: {{ message.content }} +{%- endfor %} +{%- if add_generation_prompt %}assistant:{%- endif %}"#; + + let chat_template: ChatTemplate = serde_json::from_value(serde_json::json!({ + "chat_template": template + })) + .unwrap(); + + let formatter = + HfTokenizerConfigJsonFormatter::new(chat_template, ContextMixins::new(&[])).unwrap(); + + // Verify detection worked + assert!(formatter.template_handles_reasoning); + + let request: NvCreateChatCompletionRequest = serde_json::from_value(serde_json::json!({ + "model": "test-model", + "messages": [ + {"role": "user", "content": "Hello"}, + { + "role": "assistant", + 
"content": "Hi.", + "reasoning_content": "The user said hello." + }, + {"role": "user", "content": "Bye"} + ] + })) + .unwrap(); + + let rendered = formatter.render(&request).unwrap(); + + // Template renders reasoning natively — no duplicate injection + assert!( + rendered.contains("The user said hello."), + "template must render reasoning_content natively, got: {}", + rendered + ); + // Must NOT have double blocks + let think_count = rendered.matches("").count(); + assert_eq!( + think_count, 1, + "must have exactly one block (from template), got {} in: {}", + think_count, rendered + ); + } } diff --git a/lib/llm/src/protocols/anthropic/types.rs b/lib/llm/src/protocols/anthropic/types.rs index 2d24eec41000..209010ecb40e 100644 --- a/lib/llm/src/protocols/anthropic/types.rs +++ b/lib/llm/src/protocols/anthropic/types.rs @@ -900,7 +900,22 @@ impl TryFrom for NvCreateChatCompletionRequest { ..Default::default() }) }, - chat_template_args: None, + // chat_template_args may be augmented by the Anthropic handler + // (anthropic.rs) after conversion — e.g., setting enable_thinking=true + // when a reasoning parser is configured. The conversion layer only + // forwards the client's explicit thinking preference here; the handler + // has access to parsing_options and makes the final decision. 
+ chat_template_args: if req + .thinking + .as_ref() + .is_some_and(|t| t.thinking_type == "enabled") + { + let mut args = std::collections::HashMap::new(); + args.insert("enable_thinking".to_string(), serde_json::Value::Bool(true)); + Some(args) + } else { + None + }, media_io_kwargs: None, unsupported_fields: Default::default(), }) diff --git a/lib/parsers/src/reasoning/mod.rs b/lib/parsers/src/reasoning/mod.rs index bed1e67cb7af..9e19de53ed0e 100644 --- a/lib/parsers/src/reasoning/mod.rs +++ b/lib/parsers/src/reasoning/mod.rs @@ -389,4 +389,75 @@ mod tests { assert_eq!(r_k25.reasoning_text, "reasoning"); assert_eq!(r_k25.normal_text, "answer"); } + + // Scenario 1: Normal streaming flow with force_reasoning + set_in_reasoning. + // Simulates the OpenAI path where the preprocessor detects prompt-injected + // reasoning and calls set_in_reasoning(true). The parser should correctly + // transition from reasoning to content when arrives. + #[test] + fn test_nemotron_streaming_with_set_in_reasoning() { + let mut parser = ReasoningParserType::DeepseekR1.get_reasoning_parser(); + parser.set_in_reasoning(true); // OpenAI path calls this + + let tokens = &["Think", "ing about", " this", ".\n\n", "", "Four"]; + + let mut all_reasoning = String::new(); + let mut all_content = String::new(); + for token in tokens { + let r = parser.parse_reasoning_streaming_incremental(token, &[]); + all_reasoning.push_str(&r.reasoning_text); + all_content.push_str(&r.normal_text); + } + assert_eq!(all_reasoning, "Thinking about this.\n\n"); + assert_eq!(all_content, "Four"); + } + + // Scenario 2: Streaming with force_reasoning but WITHOUT set_in_reasoning. + // Simulates the Anthropic path bug where thinking_enabled=false and + // set_in_reasoning is never called. The parser still starts in reasoning + // mode (force_reasoning=true) but stripped_think_start=false. The + // boundary must still be detected correctly. 
+ #[test] + fn test_nemotron_streaming_force_reasoning_without_set_in_reasoning() { + // DeepseekR1 has force_reasoning=true but we do NOT call set_in_reasoning + let mut parser = ReasoningParserType::DeepseekR1.get_reasoning_parser(); + + let tokens = &["Think", "ing about", " this", ".\n\n", "", "Four"]; + + let mut all_reasoning = String::new(); + let mut all_content = String::new(); + for token in tokens { + let r = parser.parse_reasoning_streaming_incremental(token, &[]); + all_reasoning.push_str(&r.reasoning_text); + all_content.push_str(&r.normal_text); + } + assert_eq!(all_reasoning, "Thinking about this.\n\n"); + assert_eq!(all_content, "Four"); + } + + // Scenario 3: Token-by-token split across chunks. + // The '<' in '' is a prefix of ''. When stripped_think_start + // is false, the parser's prefix-check could buffer '<' and interfere with + // detection. This test verifies the boundary is detected even when + // arrives as individual characters. + #[test] + fn test_nemotron_streaming_split_end_think_tokens() { + let mut parser = ReasoningParserType::DeepseekR1.get_reasoning_parser(); + parser.set_in_reasoning(true); + + // Simulate token-by-token arrival including split across chunks + let tokens = &[ + "reason", "ing", " done", ".", "", "Hello", " world", + ]; + + let mut all_reasoning = String::new(); + let mut all_content = String::new(); + for token in tokens { + let r = parser.parse_reasoning_streaming_incremental(token, &[]); + all_reasoning.push_str(&r.reasoning_text); + all_content.push_str(&r.normal_text); + } + assert_eq!(all_reasoning, "reasoning done."); + assert_eq!(all_content, "Hello world"); + } }