From acc9661b79de36c6ab78bfdd72c0821b41ed00de Mon Sep 17 00:00:00 2001 From: Vladislav Nosivskoy Date: Fri, 5 Dec 2025 18:56:35 +0300 Subject: [PATCH 1/3] fix guided decoding params handling in vllm Signed-off-by: Vladislav Nosivskoy --- components/src/dynamo/vllm/handlers.py | 20 ++++++++++-- tests/serve/test_vllm.py | 45 ++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index 55ee6ffcf31..a36d51a541e 100644 --- a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -12,7 +12,7 @@ from vllm.inputs import TokensPrompt from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import SamplingParams, StructuredOutputsParams from vllm.v1.engine.exceptions import EngineDeadError from dynamo.llm import ( @@ -82,8 +82,24 @@ def build_sampling_params( sampling_params = SamplingParams(**default_sampling_params) sampling_params.detokenize = False - # Apply sampling_options + # Handle guided_decoding - convert to StructuredOutputsParams + guided_decoding = request["sampling_options"].get("guided_decoding") + if guided_decoding is not None and isinstance(guided_decoding, dict): + sampling_params.structured_outputs = StructuredOutputsParams( + json=guided_decoding.get("json"), + regex=guided_decoding.get("regex"), + choice=guided_decoding.get("choice"), + grammar=guided_decoding.get("grammar"), + whitespace_pattern=guided_decoding.get("whitespace_pattern"), + ) + if "backend" in guided_decoding and guided_decoding["backend"] is not None: + sampling_params.structured_outputs._backend = guided_decoding["backend"] + + # Apply remaining sampling_options for key, value in request["sampling_options"].items(): + # Skip guided_decoding - already handled above + if key == "guided_decoding": + continue if value is not None and hasattr(sampling_params, key): setattr(sampling_params, key, value) diff --git a/tests/serve/test_vllm.py b/tests/serve/test_vllm.py index 4bd0db74712..a7934371f22 100644 --- a/tests/serve/test_vllm.py +++ b/tests/serve/test_vllm.py @@ -428,6 +428,51 @@ class VLLMConfig(EngineConfig): completion_payload_default(), ], ), + "guided_decoding_json": VLLMConfig( + name="guided_decoding_json", + directory=vllm_dir, + script_name="agg.sh", + marks=[pytest.mark.gpu_1, pytest.mark.pre_merge], + model="Qwen/Qwen3-0.6B", + request_payloads=[ + chat_payload( + "Generate a person with name and age", + repeat_count=1, + expected_response=['"name"', '"age"'], + temperature=0.0, + max_tokens=100, + extra_body={ + "guided_json": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + }, + "required": ["name", "age"], + } + }, + ) + ], + ), + "guided_decoding_regex": VLLMConfig( + name="guided_decoding_regex", + directory=vllm_dir, + script_name="agg.sh", + marks=[pytest.mark.gpu_1, pytest.mark.pre_merge], + model="Qwen/Qwen3-0.6B", + request_payloads=[ + chat_payload( + "Generate an email address", + repeat_count=1, + expected_response=["@"], + temperature=0.0, + max_tokens=50, + extra_body={ + "guided_regex": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}" + }, + ) + ], + ), } From 34f3937f5fe6190553745b9e9a16885998892e84 Mon Sep 17 00:00:00 2001 From: Vladislav Nosivskoy Date: Fri, 5 Dec 2025 19:18:36 +0300 Subject: [PATCH 2/3] remove backend from request-level params Signed-off-by: Vladislav Nosivskoy --- components/src/dynamo/vllm/handlers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index a36d51a541e..32fed9afc07 100644 --- a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -92,8 +92,6 @@ def build_sampling_params( grammar=guided_decoding.get("grammar"), whitespace_pattern=guided_decoding.get("whitespace_pattern"), ) - if "backend" in guided_decoding and guided_decoding["backend"] is not None: - sampling_params.structured_outputs._backend = guided_decoding["backend"] # Apply remaining sampling_options for key, value in request["sampling_options"].items(): From 5bbfc815e41a57c50212084c30e70c6c07353120 Mon Sep 17 00:00:00 2001 From: Vladislav Nosivskoy Date: Fri, 5 Dec 2025 19:39:49 +0300 Subject: [PATCH 3/3] add extra_body in payload builder Signed-off-by: Vladislav Nosivskoy --- tests/serve/test_vllm.py | 27 +++++++++++++++++++++------ tests/utils/payload_builder.py | 4 ++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/tests/serve/test_vllm.py b/tests/serve/test_vllm.py index a7934371f22..e344be2ea24 100644 --- a/tests/serve/test_vllm.py +++ b/tests/serve/test_vllm.py @@ -462,14 +462,29 @@ class VLLMConfig(EngineConfig): model="Qwen/Qwen3-0.6B", request_payloads=[ chat_payload( - "Generate an email address", + "Generate a color name (red, blue, or green)", repeat_count=1, - expected_response=["@"], + expected_response=["red", "blue", "green"], temperature=0.0, - max_tokens=50, - extra_body={ - "guided_regex": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}" - }, + max_tokens=20, + extra_body={"guided_regex": r"(red|blue|green)"}, + ) + ], + ), + "guided_decoding_choice": VLLMConfig( + name="guided_decoding_choice", + directory=vllm_dir, + script_name="agg.sh", + marks=[pytest.mark.gpu_1, pytest.mark.pre_merge], + model="Qwen/Qwen3-0.6B", + request_payloads=[ + chat_payload( + "Generate a color name (red, blue, or green)", + repeat_count=1, + expected_response=["red", "blue", "green"], + temperature=0.0, + max_tokens=20, + extra_body={"guided_choice": ["red", "blue", "green"]}, ) ], ), diff --git a/tests/utils/payload_builder.py b/tests/utils/payload_builder.py index 1b2e8bf9631..e0bf978b997 100644 --- a/tests/utils/payload_builder.py +++ b/tests/utils/payload_builder.py @@ -134,6 +134,7 @@ def chat_payload( max_tokens: int = 300, temperature: Optional[float] = None, stream: bool = False, + extra_body: Optional[Dict[str, Any]] = None, ) -> ChatPayload: body: Dict[str, Any] = { "messages": [ @@ -148,6 +149,9 @@ def chat_payload( if temperature is not None: body["temperature"] = temperature + if extra_body: + body.update(extra_body) + return ChatPayload( body=body, repeat_count=repeat_count,