-
-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[Frontend] Honor chat template for gpt-oss harmony (#23015) #30482
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,7 +10,7 @@ | |
| from contextlib import AsyncExitStack | ||
| from copy import copy | ||
| from http import HTTPStatus | ||
| from typing import Final | ||
| from typing import Final, cast | ||
|
|
||
| import jinja2 | ||
| from fastapi import Request | ||
|
|
@@ -57,6 +57,8 @@ | |
| from vllm.entrypoints.chat_utils import ( | ||
| ChatCompletionMessageParam, | ||
| ChatTemplateContentFormatOption, | ||
| ConversationMessage, | ||
| apply_hf_chat_template, | ||
| ) | ||
| from vllm.entrypoints.context import ( | ||
| ConversationContext, | ||
|
|
@@ -352,8 +354,8 @@ async def create_responses( | |
| tokenizer = await self.engine_client.get_tokenizer() | ||
|
|
||
| if self.use_harmony: | ||
| messages, engine_prompts = self._make_request_with_harmony( | ||
| request, prev_response | ||
| messages, request_prompts, engine_prompts = ( | ||
| self._make_request_with_harmony(request, prev_response, tokenizer) | ||
| ) | ||
| else: | ||
| messages, engine_prompts = await self._make_request( | ||
|
|
@@ -577,13 +579,87 @@ def _make_request_with_harmony( | |
| self, | ||
| request: ResponsesRequest, | ||
| prev_response: ResponsesResponse | None, | ||
| tokenizer: TokenizerLike, | ||
| ): | ||
| if request.tool_choice != "auto": | ||
| raise NotImplementedError( | ||
| "Only 'auto' tool_choice is supported in response API with Harmony" | ||
| ) | ||
| messages = self._construct_input_messages_with_harmony(request, prev_response) | ||
| prompt_token_ids = render_for_completion(messages) | ||
|
|
||
| # If a chat template is provided, allow rendering through it even in | ||
| # Harmony mode, then tokenize. On failure, fall back to Harmony render. | ||
| if self.chat_template: | ||
| try: | ||
| conversation: list[ConversationMessage] = [] | ||
| if request.instructions: | ||
| conversation.append( | ||
| {"role": "system", "content": request.instructions} | ||
| ) | ||
|
|
||
| if isinstance(request.input, str): | ||
| conversation.append({"role": "user", "content": request.input}) | ||
| elif isinstance(request.input, list): | ||
| for item in request.input: | ||
| if hasattr(item, "model_dump"): | ||
|
Comment on lines
+601
to
+605
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
In the Harmony Responses path, the chat-template logic constructs […comment truncated in extraction]. Useful? React with 👍 / 👎. |
||
| conversation.append(item.model_dump(exclude_none=True)) | ||
| elif isinstance(item, dict): | ||
| conversation.append(item) # type: ignore[arg-type] | ||
| else: | ||
| conversation.append(cast(ConversationMessage, item)) | ||
|
|
||
| tools = None | ||
| if request.tools: | ||
| tools = [tool.model_dump() for tool in request.tools] | ||
|
|
||
| prompt_text = apply_hf_chat_template( | ||
| tokenizer=getattr(tokenizer, "hf_tokenizer", tokenizer), | ||
| conversation=conversation, | ||
| chat_template=self.chat_template, | ||
| tools=tools, | ||
| model_config=self.model_config, | ||
| ) | ||
|
|
||
| tokenized = tokenizer(prompt_text, add_special_tokens=False) | ||
| rendered_ids = ( | ||
| tokenized["input_ids"] if isinstance(tokenized, dict) else tokenized | ||
| ) | ||
| normalized_ids: list[int] | None = None | ||
| if isinstance(rendered_ids, list): | ||
| if rendered_ids and all(isinstance(t, int) for t in rendered_ids): | ||
| normalized_ids = rendered_ids | ||
| elif rendered_ids and isinstance(rendered_ids[0], list): | ||
| candidate = rendered_ids[0] | ||
| if all(isinstance(t, int) for t in candidate): | ||
| normalized_ids = candidate | ||
| elif hasattr(rendered_ids, "tolist"): | ||
| flat = rendered_ids.tolist() | ||
| if flat and isinstance(flat[0], list): | ||
| flat = flat[0] | ||
| if flat and all(isinstance(t, int) for t in flat): | ||
| normalized_ids = flat | ||
|
|
||
| if normalized_ids is None and hasattr(tokenizer, "encode"): | ||
| try: | ||
| normalized_ids = tokenizer.encode( | ||
| prompt_text, add_special_tokens=False | ||
| ) | ||
| except Exception: | ||
| normalized_ids = None | ||
|
|
||
| if normalized_ids is not None: | ||
| prompt_token_ids = normalized_ids | ||
| else: | ||
| logger.warning( | ||
| "Harmony chat template returned non-int tokens; " | ||
| "falling back to Harmony default" | ||
| ) | ||
| except Exception: | ||
| logger.warning( | ||
| "Harmony chat template rendering failed; using Harmony default", | ||
| exc_info=True, | ||
| ) | ||
| engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids) | ||
|
|
||
| # Add cache_salt if provided in the request | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When a
chat_templateis configured in the Harmony chat-completions path, the template rendering rebuildsconversationsolely fromrequest.messagesand then replacesprompt_token_idswith the templated output. That block omits the Harmony-specific system and developer messages assembled just above (tool guidance, reasoning-effort instructions, etc.), so successful template rendering sends the model a prompt without those guardrails or tool metadata whenever a server chat template is enabled.Useful? React with 👍 / 👎.