Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions python/openai/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -606,10 +606,9 @@ tool calling result: The weather in Dallas, Texas is 85 degrees fahrenheit. It i

#### Named Tool Calling

The OpenAI frontend supports named function calling, utilizing guided decoding in the vLLM and TensorRT-LLM backends. Users can specify one of the tools in `tool_choice` to force the model to select a specific tool for function calling.
The OpenAI frontend supports named function calling, utilizing structured outputs in the vLLM backend and guided decoding in the TensorRT-LLM backend. Users can specify one of the tools in `tool_choice` to force the model to select a specific tool for function calling.

> [!NOTE]
> The latest release of TensorRT-LLM (v0.18.0) does not yet support guided decoding. To enable this feature, use a build from the main branch of TensorRT-LLM.
> For instructions on enabling guided decoding in the TensorRT-LLM backend, please refer to [this guide](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/guided_decoding.md)

Example for making a named tool calling request:
Expand Down
3 changes: 2 additions & 1 deletion python/openai/openai_frontend/engine/triton_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,8 @@ def _get_streaming_response_delta(
# check to make sure we haven't "forgotten" to stream
# any tokens that were generated but previously
# matched by partial json parsing, such as '}'.
# only happens if we are NOT using guided decoding
# only happens if we are NOT using structured outputs
# or guided decoding
if (
self._should_check_for_unstreamed_tool_arg_tokens(
response_delta=response_delta,
Expand Down
6 changes: 3 additions & 3 deletions python/openai/openai_frontend/engine/utils/triton.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,11 @@ def _create_vllm_generate_request(

guided_json = _get_guided_json_from_tool(request)
if guided_json is not None:
from vllm.sampling_params import GuidedDecodingParams
from vllm.sampling_params import StructuredOutputsParams

sampling_parameters_json = json.loads(sampling_parameters)
sampling_parameters_json["guided_decoding"] = json.dumps(
asdict(GuidedDecodingParams.from_optional(json=guided_json))
sampling_parameters_json["structured_outputs"] = json.dumps(
asdict(StructuredOutputsParams.from_optional(json=guided_json))
)
sampling_parameters = json.dumps(sampling_parameters_json)

Expand Down
8 changes: 0 additions & 8 deletions python/openai/tests/test_tool_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,10 +341,6 @@ async def test_tool_call_with_reply_response(
# validate that streaming and non-streaming generate the same content
assert "".join(chunks) == choice.message.content

@pytest.mark.skipif(
os.environ.get("IMAGE_KIND") == "TRTLLM",
reason="latest release version of Tensorrt LLM 0.18 doesn't support guided decoding",
)
@pytest.mark.asyncio
async def test_tool_call_with_named_tool_choice(
self, client: openai.AsyncOpenAI, model: str
Expand Down Expand Up @@ -448,10 +444,6 @@ async def test_tool_call_with_named_tool_choice(
assert choice.message.role == role_name
assert choice.message.tool_calls[0].function.name == function_name

@pytest.mark.skipif(
os.environ.get("IMAGE_KIND") == "TRTLLM",
reason="latest release version of Tensorrt LLM 0.18 doesn't support guided decoding",
)
@pytest.mark.asyncio
async def test_tool_call_with_required_tool_choice(
self, client: openai.AsyncOpenAI, model: str
Expand Down
Loading