From acc9661b79de36c6ab78bfdd72c0821b41ed00de Mon Sep 17 00:00:00 2001
From: Vladislav Nosivskoy <vladnosiv@gmail.com>
Date: Fri, 5 Dec 2025 18:56:35 +0300
Subject: [PATCH 1/3] fix guided decoding params handling in vllm

Signed-off-by: Vladislav Nosivskoy <vladnosiv@gmail.com>
---
 components/src/dynamo/vllm/handlers.py | 20 ++++++++++--
 tests/serve/test_vllm.py               | 45 ++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py
index 55ee6ffcf31..a36d51a541e 100644
--- a/components/src/dynamo/vllm/handlers.py
+++ b/components/src/dynamo/vllm/handlers.py
@@ -12,7 +12,7 @@
 from vllm.inputs import TokensPrompt
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import SamplingParams, StructuredOutputsParams
 from vllm.v1.engine.exceptions import EngineDeadError
 
 from dynamo.llm import (
@@ -82,8 +82,24 @@ def build_sampling_params(
     sampling_params = SamplingParams(**default_sampling_params)
     sampling_params.detokenize = False
 
-    # Apply sampling_options
+    # Handle guided_decoding - convert to StructuredOutputsParams
+    guided_decoding = request["sampling_options"].get("guided_decoding")
+    if guided_decoding is not None and isinstance(guided_decoding, dict):
+        sampling_params.structured_outputs = StructuredOutputsParams(
+            json=guided_decoding.get("json"),
+            regex=guided_decoding.get("regex"),
+            choice=guided_decoding.get("choice"),
+            grammar=guided_decoding.get("grammar"),
+            whitespace_pattern=guided_decoding.get("whitespace_pattern"),
+        )
+        if "backend" in guided_decoding and guided_decoding["backend"] is not None:
+            sampling_params.structured_outputs._backend = guided_decoding["backend"]
+
+    # Apply remaining sampling_options
     for key, value in request["sampling_options"].items():
+        # Skip guided_decoding - already handled above
+        if key == "guided_decoding":
+            continue
         if value is not None and hasattr(sampling_params, key):
             setattr(sampling_params, key, value)
 
diff --git a/tests/serve/test_vllm.py b/tests/serve/test_vllm.py
index 4bd0db74712..a7934371f22 100644
--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -428,6 +428,51 @@ class VLLMConfig(EngineConfig):
             completion_payload_default(),
         ],
     ),
+    "guided_decoding_json": VLLMConfig(
+        name="guided_decoding_json",
+        directory=vllm_dir,
+        script_name="agg.sh",
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
+        model="Qwen/Qwen3-0.6B",
+        request_payloads=[
+            chat_payload(
+                "Generate a person with name and age",
+                repeat_count=1,
+                expected_response=['"name"', '"age"'],
+                temperature=0.0,
+                max_tokens=100,
+                extra_body={
+                    "guided_json": {
+                        "type": "object",
+                        "properties": {
+                            "name": {"type": "string"},
+                            "age": {"type": "integer"},
+                        },
+                        "required": ["name", "age"],
+                    }
+                },
+            )
+        ],
+    ),
+    "guided_decoding_regex": VLLMConfig(
+        name="guided_decoding_regex",
+        directory=vllm_dir,
+        script_name="agg.sh",
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
+        model="Qwen/Qwen3-0.6B",
+        request_payloads=[
+            chat_payload(
+                "Generate an email address",
+                repeat_count=1,
+                expected_response=["@"],
+                temperature=0.0,
+                max_tokens=50,
+                extra_body={
+                    "guided_regex": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
+                },
+            )
+        ],
+    ),
 }
 
 

From 34f3937f5fe6190553745b9e9a16885998892e84 Mon Sep 17 00:00:00 2001
From: Vladislav Nosivskoy <vladnosiv@gmail.com>
Date: Fri, 5 Dec 2025 19:18:36 +0300
Subject: [PATCH 2/3] remove backend from request-level params

Signed-off-by: Vladislav Nosivskoy <vladnosiv@gmail.com>
---
 components/src/dynamo/vllm/handlers.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py
index a36d51a541e..32fed9afc07 100644
--- a/components/src/dynamo/vllm/handlers.py
+++ b/components/src/dynamo/vllm/handlers.py
@@ -92,8 +92,6 @@ def build_sampling_params(
             grammar=guided_decoding.get("grammar"),
             whitespace_pattern=guided_decoding.get("whitespace_pattern"),
         )
-        if "backend" in guided_decoding and guided_decoding["backend"] is not None:
-            sampling_params.structured_outputs._backend = guided_decoding["backend"]
 
     # Apply remaining sampling_options
     for key, value in request["sampling_options"].items():

From 5bbfc815e41a57c50212084c30e70c6c07353120 Mon Sep 17 00:00:00 2001
From: Vladislav Nosivskoy <vladnosiv@gmail.com>
Date: Fri, 5 Dec 2025 19:39:49 +0300
Subject: [PATCH 3/3] add extra_body to chat_payload builder; add guided_choice test

Signed-off-by: Vladislav Nosivskoy <vladnosiv@gmail.com>
---
 tests/serve/test_vllm.py       | 27 +++++++++++++++++++++------
 tests/utils/payload_builder.py |  4 ++++
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/tests/serve/test_vllm.py b/tests/serve/test_vllm.py
index a7934371f22..e344be2ea24 100644
--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -462,14 +462,29 @@ class VLLMConfig(EngineConfig):
         model="Qwen/Qwen3-0.6B",
         request_payloads=[
             chat_payload(
-                "Generate an email address",
+                "Generate a color name (red, blue, or green)",
                 repeat_count=1,
-                expected_response=["@"],
+                expected_response=["red", "blue", "green"],
                 temperature=0.0,
-                max_tokens=50,
-                extra_body={
-                    "guided_regex": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
-                },
+                max_tokens=20,
+                extra_body={"guided_regex": r"(red|blue|green)"},
+            )
+        ],
+    ),
+    "guided_decoding_choice": VLLMConfig(
+        name="guided_decoding_choice",
+        directory=vllm_dir,
+        script_name="agg.sh",
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
+        model="Qwen/Qwen3-0.6B",
+        request_payloads=[
+            chat_payload(
+                "Generate a color name (red, blue, or green)",
+                repeat_count=1,
+                expected_response=["red", "blue", "green"],
+                temperature=0.0,
+                max_tokens=20,
+                extra_body={"guided_choice": ["red", "blue", "green"]},
             )
         ],
     ),
diff --git a/tests/utils/payload_builder.py b/tests/utils/payload_builder.py
index 1b2e8bf9631..e0bf978b997 100644
--- a/tests/utils/payload_builder.py
+++ b/tests/utils/payload_builder.py
@@ -134,6 +134,7 @@ def chat_payload(
     max_tokens: int = 300,
     temperature: Optional[float] = None,
     stream: bool = False,
+    extra_body: Optional[Dict[str, Any]] = None,
 ) -> ChatPayload:
     body: Dict[str, Any] = {
         "messages": [
@@ -148,6 +149,9 @@ def chat_payload(
     if temperature is not None:
         body["temperature"] = temperature
 
+    if extra_body:
+        body.update(extra_body)
+
     return ChatPayload(
         body=body,
         repeat_count=repeat_count,