diff --git a/python/openai/README.md b/python/openai/README.md index 7d811b782e..0c7c70346e 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -606,10 +606,9 @@ tool calling result: The weather in Dallas, Texas is 85 degrees fahrenheit. It i #### Named Tool Calling -The OpenAI frontend supports named function calling, utilizing guided decoding in the vLLM and TensorRT-LLM backends. Users can specify one of the tools in `tool_choice` to force the model to select a specific tool for function calling. +The OpenAI frontend supports named function calling, utilizing structured outputs in the vLLM backend and guided decoding in the TensorRT-LLM backend. Users can specify one of the tools in `tool_choice` to force the model to select a specific tool for function calling. > [!NOTE] -> The latest release of TensorRT-LLM (v0.18.0) does not yet support guided decoding. To enable this feature, use a build from the main branch of TensorRT-LLM. > For instructions on enabling guided decoding in the TensorRT-LLM backend, please refer to [this guide](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/guided_decoding.md) Example for making a named tool calling request: diff --git a/python/openai/openai_frontend/engine/triton_engine.py b/python/openai/openai_frontend/engine/triton_engine.py index 832c9ee710..4fe8ca5345 100644 --- a/python/openai/openai_frontend/engine/triton_engine.py +++ b/python/openai/openai_frontend/engine/triton_engine.py @@ -678,7 +678,8 @@ def _get_streaming_response_delta( # check to make sure we haven't "forgotten" to stream # any tokens that were generated but previously # matched by partial json parsing, such as '}'. 
- # only happens if we are NOT using guided decoding + # only happens if we are NOT using structured outputs + # or guided decoding if ( self._should_check_for_unstreamed_tool_arg_tokens( response_delta=response_delta, diff --git a/python/openai/openai_frontend/engine/utils/triton.py b/python/openai/openai_frontend/engine/utils/triton.py index a660f589e4..eb99b625f7 100644 --- a/python/openai/openai_frontend/engine/utils/triton.py +++ b/python/openai/openai_frontend/engine/utils/triton.py @@ -113,11 +113,11 @@ def _create_vllm_generate_request( guided_json = _get_guided_json_from_tool(request) if guided_json is not None: - from vllm.sampling_params import GuidedDecodingParams + from vllm.sampling_params import StructuredOutputsParams sampling_parameters_json = json.loads(sampling_parameters) - sampling_parameters_json["guided_decoding"] = json.dumps( - asdict(GuidedDecodingParams.from_optional(json=guided_json)) + sampling_parameters_json["structured_outputs"] = json.dumps( + asdict(StructuredOutputsParams.from_optional(json=guided_json)) ) sampling_parameters = json.dumps(sampling_parameters_json) diff --git a/python/openai/tests/test_tool_calling.py b/python/openai/tests/test_tool_calling.py index c738b96303..8f794cc62c 100644 --- a/python/openai/tests/test_tool_calling.py +++ b/python/openai/tests/test_tool_calling.py @@ -341,10 +341,6 @@ async def test_tool_call_with_reply_response( # validate if steaming and non-streaming generates the same content assert "".join(chunks) == choice.message.content - @pytest.mark.skipif( - os.environ.get("IMAGE_KIND") == "TRTLLM", - reason="latest release version of Tensorrt LLM 0.18 doesn't support guided decoding", - ) @pytest.mark.asyncio async def test_tool_call_with_named_tool_choice( self, client: openai.AsyncOpenAI, model: str @@ -448,10 +444,6 @@ async def test_tool_call_with_named_tool_choice( assert choice.message.role == role_name assert choice.message.tool_calls[0].function.name == function_name - 
@pytest.mark.skipif( - os.environ.get("IMAGE_KIND") == "TRTLLM", - reason="latest release version of Tensorrt LLM 0.18 doesn't support guided decoding", - ) @pytest.mark.asyncio async def test_tool_call_with_required_tool_choice( self, client: openai.AsyncOpenAI, model: str