Merged

23 commits
327a154  feat: harmony tool support for /chat/completions (aarnphm, Aug 6, 2025)
f03ea94  merge: branch 'main' of github.com:vllm-project/vllm into feat/gpt-os… (aarnphm, Aug 25, 2025)
d093940  chore: add test cases for serving chat (aarnphm, Aug 25, 2025)
72f83a0  chore: update interfaces (aarnphm, Aug 25, 2025)
a891510  fix: linter issue (aarnphm, Aug 25, 2025)
3d029bd  fix: correct types and lint (aarnphm, Aug 25, 2025)
bd57e40  fix: tests (aarnphm, Aug 25, 2025)
7b3dce2  Merge branch 'main' into feat/gpt-oss-fc (simon-mo, Aug 26, 2025)
a806f00  fix: address comments (aarnphm, Sep 2, 2025)
01d3ce8  merge: branch 'main' of github.com:vllm-project/vllm into feat/gpt-os… (aarnphm, Sep 2, 2025)
12f18c3  chore: enable tests for gpt-oss e2e (aarnphm, Sep 2, 2025)
62bbad3  revert: remove interfaces change and keep gpt-oss separate (aarnphm, Sep 2, 2025)
629f974  chore: update test chat (aarnphm, Sep 2, 2025)
503f98f  chore: cleanup separate path for reasoning contents (aarnphm, Sep 2, 2025)
c871dd0  chore: final styling (aarnphm, Sep 3, 2025)
6c4f42b  merge: branch 'main' of github.com:vllm-project/vllm into feat/gpt-os… (aarnphm, Sep 3, 2025)
ad75bee  fix: import issue (aarnphm, Sep 3, 2025)
28764fe  fix: correct tests and remove reasoning content (aarnphm, Sep 3, 2025)
487ec94  fix: tests (aarnphm, Sep 3, 2025)
f6aa5d5  merge: branch 'main' of github.com:vllm-project/vllm into feat/gpt-os… (aarnphm, Sep 3, 2025)
f0ee88a  merge: branch 'main' of github.com:vllm-project/vllm into feat/gpt-os… (aarnphm, Sep 4, 2025)
9588a55  fix: correct fixed names (aarnphm, Sep 4, 2025)
d9b3d7f  merge: branch 'main' of github.com:vllm-project/vllm into feat/gpt-os… (aarnphm, Sep 4, 2025)
tests/entrypoints/openai/test_serving_chat.py (78 changes: 77 additions & 1 deletion)
@@ -1,13 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from __future__ import annotations

import asyncio
+from contextlib import suppress
from dataclasses import dataclass, field
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
from unittest.mock import MagicMock

import pytest
+import pytest_asyncio

from vllm.config import MultiModalConfig
from vllm.engine.multiprocessing.client import MQLLMEngineClient
@@ -17,6 +20,79 @@
OpenAIServingModels)
from vllm.transformers_utils.tokenizer import get_tokenizer

from ...utils import RemoteOpenAIServer

if TYPE_CHECKING:
    from openai import AsyncOpenAI

GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"


@pytest.fixture(scope="module")
def gptoss_server():
args = ["--enforce-eager"]
with RemoteOpenAIServer(GPT_OSS_MODEL_NAME, args) as remote_server:
yield remote_server


@pytest_asyncio.fixture
async def gptoss_client(gptoss_server):
async with gptoss_server.get_async_client() as async_client:
yield async_client


@pytest.mark.asyncio
@pytest.mark.skip(reason="gpt-oss can't run on CI yet.")
async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: AsyncOpenAI):
tools = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string"
},
"state": {
"type": "string"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "unit"],
},
},
}]

messages = [
{
"role": "user",
"content": "What is the weather in Dallas, TX?"
},
]

stream = await gptoss_client.chat.completions.create(
model=GPT_OSS_MODEL_NAME, messages=messages, tools=tools, stream=True)

name = None
args_buf = ""
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.tool_calls:
tc = delta.tool_calls[0]
if tc.function and tc.function.name:
name = tc.function.name
if tc.function and tc.function.arguments:
args_buf += tc.function.arguments

assert name is not None
assert len(args_buf) > 0


MODEL_NAME = "openai-community/gpt2"
CHAT_TEMPLATE = "Dummy chat template for testing {}"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
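Because the test above is skipped on CI, the streaming accumulation pattern it exercises is easiest to try by hand. Below is a minimal standalone sketch of the same loop, assuming a vLLM OpenAI-compatible server is already serving openai/gpt-oss-20b; the base URL, API key, and trimmed tool schema are illustrative placeholders, not part of this PR.

# Standalone sketch of the delta-accumulation loop from the test above.
# Assumes a local vLLM server; base_url and api_key are placeholders.
import asyncio

from openai import AsyncOpenAI

TOOLS = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]


async def main() -> None:
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    stream = await client.chat.completions.create(
        model="openai/gpt-oss-20b",
        messages=[{"role": "user", "content": "What is the weather in Dallas, TX?"}],
        tools=TOOLS,
        stream=True,
    )
    name, args_buf = None, ""
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.tool_calls:
            tc = delta.tool_calls[0]
            # The function name arrives once; the arguments arrive as JSON
            # fragments that must be concatenated in order.
            if tc.function and tc.function.name:
                name = tc.function.name
            if tc.function and tc.function.arguments:
                args_buf += tc.function.arguments
    print(name, args_buf)


if __name__ == "__main__":
    asyncio.run(main())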
tests/tool_use/test_openai_tool_parser.py (132 changes: 132 additions & 0 deletions)
@@ -0,0 +1,132 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import pytest
from openai_harmony import (HarmonyEncodingName, Message, Role,
load_harmony_encoding)

from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers import OpenAIToolParser
from vllm.transformers_utils.tokenizer import get_tokenizer

MODEL = "gpt2"


@pytest.fixture(scope="module")
def openai_tokenizer():
# The parser does not use the tokenizer, but the constructor requires it.
return get_tokenizer(MODEL)


@pytest.fixture
def openai_tool_parser(openai_tokenizer):
return OpenAIToolParser(openai_tokenizer)


@pytest.fixture(scope="module")
def harmony_encoding():
return load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)


def assert_tool_calls(actual_tool_calls: list[ToolCall],
expected_tool_calls: list[ToolCall]):
assert len(actual_tool_calls) == len(expected_tool_calls)

for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
expected_tool_calls):
assert isinstance(actual_tool_call.id, str)
assert len(actual_tool_call.id) > 16 # Default from protocol.py
assert actual_tool_call.type == "function"
assert actual_tool_call.function == expected_tool_call.function


def test_extract_tool_calls_no_tools(openai_tool_parser, harmony_encoding):
msg = Message.from_role_and_content(Role.ASSISTANT,
"This is a test").with_channel("final")
stop_token = harmony_encoding.token_from_string("<|return|>")
token_ids = harmony_encoding.render_message(msg) + [stop_token]

extracted_info = openai_tool_parser.extract_tool_calls("",
request=None,
token_ids=token_ids)
assert not extracted_info.tools_called
assert extracted_info.tool_calls == []
assert extracted_info.content == "This is a test"


def test_extract_tool_calls_single_tool(openai_tool_parser, harmony_encoding):
msg = Message.from_role_and_content(
Role.ASSISTANT, '{"city": "Dallas"}').with_channel("commentary"). \
with_recipient("functions.get_current_weather").with_content_type("json")
stop_token = harmony_encoding.token_from_string("<|call|>")
token_ids = harmony_encoding.render_message(msg) + [stop_token]

extracted_info = openai_tool_parser.extract_tool_calls("",
request=None,
token_ids=token_ids)
assert extracted_info.tools_called
expected_tool_calls = [
ToolCall(
function=FunctionCall(name="get_current_weather",
arguments=json.dumps({"city": "Dallas"})))
]
assert_tool_calls(extracted_info.tool_calls, expected_tool_calls)
assert extracted_info.content is None


def test_extract_tool_calls_multiple_tools(openai_tool_parser,
harmony_encoding):
msg1 = Message.from_role_and_content(
Role.ASSISTANT, '{"city": "Dallas"}').with_channel("commentary"). \
with_recipient("functions.get_current_weather").with_content_type("json")
msg2 = Message.from_role_and_content(
Role.ASSISTANT, '{}').with_channel("commentary"). \
with_recipient("functions.get_user_location").with_content_type("json")
stop_token = harmony_encoding.token_from_string("<|call|>")
token_ids = harmony_encoding.render_message(
msg1) + harmony_encoding.render_message(msg2) + [stop_token]

extracted_info = openai_tool_parser.extract_tool_calls("",
request=None,
token_ids=token_ids)
assert extracted_info.tools_called
expected_tool_calls = [
ToolCall(
function=FunctionCall(name="get_current_weather",
arguments=json.dumps({"city": "Dallas"}))),
ToolCall(function=FunctionCall(name="get_user_location",
arguments=json.dumps({})))
]
assert_tool_calls(extracted_info.tool_calls, expected_tool_calls)
assert extracted_info.content is None


def test_extract_tool_calls_with_reasoning(openai_tool_parser,
harmony_encoding):
msg1 = Message.from_role_and_content(
Role.ASSISTANT, "Thinking about the weather.").with_channel("analysis")
msg2 = Message.from_role_and_content(
Role.ASSISTANT, '{"city": "Dallas"}').with_channel("commentary"). \
with_recipient("functions.get_current_weather").with_content_type("json")
msg3 = Message.from_role_and_content(
Role.ASSISTANT, "The weather is nice.").with_channel("final")

stop_token = harmony_encoding.token_from_string("<|return|>")
token_ids = harmony_encoding.render_message(
msg1) + harmony_encoding.render_message(
msg2) + harmony_encoding.render_message(msg3) + [stop_token]

extracted_info = openai_tool_parser.extract_tool_calls("",
request=None,
token_ids=token_ids)
assert extracted_info.tools_called
assert extracted_info.reasoning_content == "Thinking about the weather."
expected_tool_calls = [
ToolCall(
function=FunctionCall(name="get_current_weather",
arguments=json.dumps({"city": "Dallas"})))
]
assert_tool_calls(extracted_info.tool_calls, expected_tool_calls)
assert extracted_info.content == "The weather is nice."
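The fixtures above translate into a short standalone round trip. A minimal sketch, assuming openai_harmony and vLLM are installed; as in the fixture, the gpt2 tokenizer only satisfies the constructor and is not used by the parser.

# Render a Harmony tool-call message to token ids, then parse it back.
import json

from openai_harmony import (HarmonyEncodingName, Message, Role,
                            load_harmony_encoding)

from vllm.entrypoints.openai.tool_parsers import OpenAIToolParser
from vllm.transformers_utils.tokenizer import get_tokenizer

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
parser = OpenAIToolParser(get_tokenizer("gpt2"))  # tokenizer is unused by the parser

# An assistant tool call is a commentary-channel message addressed to
# functions.<name> with JSON content, terminated by <|call|>.
msg = (Message.from_role_and_content(Role.ASSISTANT, json.dumps({"city": "Dallas"}))
       .with_channel("commentary")
       .with_recipient("functions.get_current_weather")
       .with_content_type("json"))
token_ids = encoding.render_message(msg) + [encoding.token_from_string("<|call|>")]

info = parser.extract_tool_calls("", request=None, token_ids=token_ids)
print(info.tools_called)                              # True
print([tc.function.name for tc in info.tool_calls])   # ['get_current_weather']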
vllm/entrypoints/harmony_utils.py (62 changes: 51 additions & 11 deletions)
@@ -18,7 +18,8 @@
Role, StreamableParser, SystemContent, TextContent,
ToolDescription, load_harmony_encoding)

-from vllm.entrypoints.openai.protocol import ResponseInputOutputItem
+from vllm.entrypoints.openai.protocol import (ChatCompletionToolsParam,
+                                              ResponseInputOutputItem)
from vllm.utils import random_uuid

REASONING_EFFORT = {
@@ -63,6 +64,20 @@ def get_system_message(
return sys_msg


def create_tool_definition(tool):
if isinstance(tool, ChatCompletionToolsParam):
return ToolDescription.new(
name=tool.function.name,
description=tool.function.description,
parameters=tool.function.parameters,
)
return ToolDescription.new(
name=tool.name,
description=tool.description,
parameters=tool.parameters,
)


def get_developer_message(instructions: Optional[str] = None,
tools: Optional[list[Tool]] = None) -> Message:
dev_msg_content = DeveloperContent.new()
@@ -80,11 +95,7 @@ def get_developer_message(instructions: Optional[str] = None,
raise ValueError(f"tool type {tool.type} not supported")
if function_tools:
function_tool_descriptions = [
-            ToolDescription.new(
-                name=tool.name,
-                description=tool.description,
-                parameters=tool.parameters,
-            ) for tool in function_tools
+            create_tool_definition(tool) for tool in function_tools
]
dev_msg_content = dev_msg_content.with_function_tools(
function_tool_descriptions)
@@ -148,16 +159,45 @@ def parse_response_input(
return msg


-def parse_chat_input(chat_msg) -> Message:
-    role = chat_msg["role"]
-    content = chat_msg["content"]
+def parse_chat_input(chat_msg) -> list[Message]:
+    if not isinstance(chat_msg, dict):
+        # Handle Pydantic models
+        chat_msg = chat_msg.model_dump(exclude_none=True)
+
+    role = chat_msg.get("role")
+
+    # Assistant message with tool calls
+    tool_calls = chat_msg.get("tool_calls")
+    if role == "assistant" and tool_calls:
+        msgs: list[Message] = []
+        for call in tool_calls:
+            func = call.get("function", {})
+            name = func.get("name", "")
+            arguments = func.get("arguments", "") or ""
+            msg = Message.from_role_and_content(Role.ASSISTANT, arguments)
+            msg = msg.with_channel("commentary")
+            msg = msg.with_recipient(f"functions.{name}")
+            msg = msg.with_content_type("json")
+            msgs.append(msg)
+        return msgs
+
+    # Tool role message (tool output)
+    if role == "tool":
+        name = chat_msg.get("name", "")
+        content = chat_msg.get("content", "") or ""
+        msg = Message.from_author_and_content(
+            Author.new(Role.TOOL, f"functions.{name}"), content)
+        return [msg]
+
+    # Default: user/assistant/system messages with content
+    content = chat_msg.get("content", "")
    if isinstance(content, str):
        contents = [TextContent(text=content)]
    else:
        # TODO: Support refusal.
-        contents = [TextContent(text=c["text"]) for c in content]
+        contents = [TextContent(text=c.get("text", "")) for c in content]
    msg = Message.from_role_and_contents(role, contents)
-    return msg
+    return [msg]


def render_for_completion(messages: list[Message]) -> list[int]:
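To illustrate what the new parse_chat_input returns, here is a minimal sketch of converting an OpenAI-style history into Harmony messages; the message contents are illustrative. Note the list return type: one assistant turn with several tool calls expands into several Harmony messages.

# Sketch: converting an OpenAI-style history (user turn, assistant tool
# call, tool result) into Harmony messages via the function above.
from vllm.entrypoints.harmony_utils import parse_chat_input

history = [
    {"role": "user", "content": "What is the weather in Dallas, TX?"},
    {
        "role": "assistant",
        "tool_calls": [{
            "function": {
                "name": "get_current_weather",
                "arguments": '{"city": "Dallas"}',
            },
        }],
    },
    {"role": "tool", "name": "get_current_weather", "content": '{"temp_f": 75}'},
]

harmony_messages = []
for chat_msg in history:
    # Each chat message yields a list: tool calls become commentary-channel
    # messages addressed to functions.<name>; tool outputs become messages
    # authored by Role.TOOL.
    harmony_messages.extend(parse_chat_input(chat_msg))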