Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 74 additions & 2 deletions vllm/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import time
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from typing import Final
from typing import Final, cast

import jinja2
import partial_json_parser
Expand All @@ -18,6 +18,7 @@
from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption,
ConversationMessage,
apply_hf_chat_template,
get_history_tool_calls_cnt,
make_tool_call_id,
)
Expand Down Expand Up @@ -250,7 +251,11 @@ async def create_chat_completion(
)
else:
# For GPT-OSS.
conversation, engine_prompts = self._make_request_with_harmony(request)
(
conversation,
request_prompts,
engine_prompts,
) = self._make_request_with_harmony(request, tokenizer)
except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
logger.exception("Error in preprocessing prompt inputs")
return self.create_error_response(f"{e} {e.__cause__}")
Expand Down Expand Up @@ -1783,6 +1788,7 @@ def _should_check_for_unstreamed_tool_arg_tokens(
def _make_request_with_harmony(
self,
request: ChatCompletionRequest,
tokenizer: TokenizerLike,
):
messages: list[OpenAIMessage] = []

Expand Down Expand Up @@ -1813,6 +1819,72 @@ def _make_request_with_harmony(

# Render prompt token ids.
prompt_token_ids = render_for_completion(messages)

# If a chat template is provided, allow rendering through it even in
# Harmony mode, then tokenize. On failure, fall back to Harmony render.
if self.chat_template:
try:
conversation: list[ConversationMessage] = []
for msg in request.messages:
model_dump_fn = getattr(msg, "model_dump", None)
if callable(model_dump_fn):
conversation.append(model_dump_fn(exclude_none=True))
Comment on lines +1827 to +1831
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: Preserve Harmony system/developer prompts when templating

When a `chat_template` is configured in the Harmony chat-completions path, the template-rendering block rebuilds `conversation` solely from `request.messages` and then replaces `prompt_token_ids` with the templated output. That block omits the Harmony-specific system and developer messages assembled just above (tool guidance, reasoning-effort instructions, etc.), so whenever a server chat template is enabled and rendering succeeds, the model receives a prompt without those guardrails or tool metadata.

Useful? React with 👍 / 👎.

elif isinstance(msg, dict):
conversation.append(msg) # type: ignore[arg-type]
else:
conversation.append(cast(ConversationMessage, msg))

tools = None
if request.tools:
tools = [tool.model_dump() for tool in request.tools]

prompt_text = apply_hf_chat_template(
tokenizer=getattr(tokenizer, "hf_tokenizer", tokenizer),
conversation=conversation,
chat_template=self.chat_template,
tools=tools,
model_config=self.model_config,
)
tokenized = tokenizer(prompt_text, add_special_tokens=False)
rendered_ids = (
tokenized["input_ids"] if isinstance(tokenized, dict) else tokenized
)
normalized_ids: list[int] | None = None
if isinstance(rendered_ids, list):
if rendered_ids and all(isinstance(t, int) for t in rendered_ids):
normalized_ids = rendered_ids
elif rendered_ids and isinstance(rendered_ids[0], list):
candidate = rendered_ids[0]
if all(isinstance(t, int) for t in candidate):
normalized_ids = candidate
elif hasattr(rendered_ids, "tolist"):
flat = rendered_ids.tolist()
if flat and isinstance(flat[0], list):
flat = flat[0]
if flat and all(isinstance(t, int) for t in flat):
normalized_ids = flat

if normalized_ids is None and hasattr(tokenizer, "encode"):
try:
normalized_ids = tokenizer.encode(
prompt_text, add_special_tokens=False
)
except Exception:
normalized_ids = None

if normalized_ids is not None:
prompt_token_ids = normalized_ids
else:
logger.warning(
"Harmony chat template returned non-int tokens; "
"falling back to Harmony default"
)
except Exception:
logger.warning(
"Harmony chat template rendering failed; using Harmony default",
exc_info=True,
)

engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)

# Add cache_salt if provided in the request
Expand Down
82 changes: 79 additions & 3 deletions vllm/entrypoints/openai/serving_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from contextlib import AsyncExitStack
from copy import copy
from http import HTTPStatus
from typing import Final
from typing import Final, cast

import jinja2
from fastapi import Request
Expand Down Expand Up @@ -57,6 +57,8 @@
from vllm.entrypoints.chat_utils import (
ChatCompletionMessageParam,
ChatTemplateContentFormatOption,
ConversationMessage,
apply_hf_chat_template,
)
from vllm.entrypoints.context import (
ConversationContext,
Expand Down Expand Up @@ -352,8 +354,8 @@ async def create_responses(
tokenizer = await self.engine_client.get_tokenizer()

if self.use_harmony:
messages, engine_prompts = self._make_request_with_harmony(
request, prev_response
messages, request_prompts, engine_prompts = (
self._make_request_with_harmony(request, prev_response, tokenizer)
)
else:
messages, engine_prompts = await self._make_request(
Expand Down Expand Up @@ -577,13 +579,87 @@ def _make_request_with_harmony(
self,
request: ResponsesRequest,
prev_response: ResponsesResponse | None,
tokenizer: TokenizerLike,
):
if request.tool_choice != "auto":
raise NotImplementedError(
"Only 'auto' tool_choice is supported in response API with Harmony"
)
messages = self._construct_input_messages_with_harmony(request, prev_response)
prompt_token_ids = render_for_completion(messages)

# If a chat template is provided, allow rendering through it even in
# Harmony mode, then tokenize. On failure, fall back to Harmony render.
if self.chat_template:
try:
conversation: list[ConversationMessage] = []
if request.instructions:
conversation.append(
{"role": "system", "content": request.instructions}
)

if isinstance(request.input, str):
conversation.append({"role": "user", "content": request.input})
elif isinstance(request.input, list):
for item in request.input:
if hasattr(item, "model_dump"):
Comment on lines +601 to +605
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: Keep Harmony conversation history in Responses templating

In the Harmony Responses path, the chat-template logic constructs `conversation` only from the current `instructions` and `input` before re-rendering, and that templated output overwrites the earlier `prompt_token_ids`. Because no prior turns or Harmony system/developer prompts from `_construct_input_messages_with_harmony` are included, any multi-turn Harmony conversation loses all previous context as soon as a server chat template is configured and the template renders successfully.

Useful? React with 👍 / 👎.

conversation.append(item.model_dump(exclude_none=True))
elif isinstance(item, dict):
conversation.append(item) # type: ignore[arg-type]
else:
conversation.append(cast(ConversationMessage, item))

tools = None
if request.tools:
tools = [tool.model_dump() for tool in request.tools]

prompt_text = apply_hf_chat_template(
tokenizer=getattr(tokenizer, "hf_tokenizer", tokenizer),
conversation=conversation,
chat_template=self.chat_template,
tools=tools,
model_config=self.model_config,
)

tokenized = tokenizer(prompt_text, add_special_tokens=False)
rendered_ids = (
tokenized["input_ids"] if isinstance(tokenized, dict) else tokenized
)
normalized_ids: list[int] | None = None
if isinstance(rendered_ids, list):
if rendered_ids and all(isinstance(t, int) for t in rendered_ids):
normalized_ids = rendered_ids
elif rendered_ids and isinstance(rendered_ids[0], list):
candidate = rendered_ids[0]
if all(isinstance(t, int) for t in candidate):
normalized_ids = candidate
elif hasattr(rendered_ids, "tolist"):
flat = rendered_ids.tolist()
if flat and isinstance(flat[0], list):
flat = flat[0]
if flat and all(isinstance(t, int) for t in flat):
normalized_ids = flat

if normalized_ids is None and hasattr(tokenizer, "encode"):
try:
normalized_ids = tokenizer.encode(
prompt_text, add_special_tokens=False
)
except Exception:
normalized_ids = None

if normalized_ids is not None:
prompt_token_ids = normalized_ids
else:
logger.warning(
"Harmony chat template returned non-int tokens; "
"falling back to Harmony default"
)
except Exception:
logger.warning(
"Harmony chat template rendering failed; using Harmony default",
exc_info=True,
)
engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)

# Add cache_salt if provided in the request
Expand Down