Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions components/src/dynamo/vllm/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from vllm.inputs import TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
from vllm.v1.engine.exceptions import EngineDeadError

from dynamo.llm import (
Expand Down Expand Up @@ -82,8 +82,22 @@ def build_sampling_params(
sampling_params = SamplingParams(**default_sampling_params)
sampling_params.detokenize = False

# Apply sampling_options
# Handle guided_decoding - convert to StructuredOutputsParams
guided_decoding = request["sampling_options"].get("guided_decoding")
if guided_decoding is not None and isinstance(guided_decoding, dict):
sampling_params.structured_outputs = StructuredOutputsParams(
json=guided_decoding.get("json"),
regex=guided_decoding.get("regex"),
choice=guided_decoding.get("choice"),
grammar=guided_decoding.get("grammar"),
whitespace_pattern=guided_decoding.get("whitespace_pattern"),
)

# Apply remaining sampling_options
for key, value in request["sampling_options"].items():
# Skip guided_decoding - already handled above
if key == "guided_decoding":
continue
if value is not None and hasattr(sampling_params, key):
setattr(sampling_params, key, value)

Expand Down
60 changes: 60 additions & 0 deletions tests/serve/test_vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,66 @@ class VLLMConfig(EngineConfig):
completion_payload_default(),
],
),
"guided_decoding_json": VLLMConfig(
name="guided_decoding_json",
directory=vllm_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload(
"Generate a person with name and age",
repeat_count=1,
expected_response=['"name"', '"age"'],
temperature=0.0,
max_tokens=100,
extra_body={
"guided_json": {
"type": "object",
"properties": {
"name": {"type": "string"},
"age": {"type": "integer"},
},
"required": ["name", "age"],
}
},
)
],
),
"guided_decoding_regex": VLLMConfig(
name="guided_decoding_regex",
directory=vllm_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload(
"Generate a color name (red, blue, or green)",
repeat_count=1,
expected_response=["red", "blue", "green"],
temperature=0.0,
max_tokens=20,
extra_body={"guided_regex": r"(red|blue|green)"},
)
],
),
"guided_decoding_choice": VLLMConfig(
name="guided_decoding_choice",
directory=vllm_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload(
"Generate a color name (red, blue, or green)",
repeat_count=1,
expected_response=["red", "blue", "green"],
temperature=0.0,
max_tokens=20,
extra_body={"guided_choice": ["red", "blue", "green"]},
)
],
),
}


Expand Down
4 changes: 4 additions & 0 deletions tests/utils/payload_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ def chat_payload(
max_tokens: int = 300,
temperature: Optional[float] = None,
stream: bool = False,
extra_body: Optional[Dict[str, Any]] = None,
) -> ChatPayload:
body: Dict[str, Any] = {
"messages": [
Expand All @@ -148,6 +149,9 @@ def chat_payload(
if temperature is not None:
body["temperature"] = temperature

if extra_body:
body.update(extra_body)

return ChatPayload(
body=body,
repeat_count=repeat_count,
Expand Down
Loading