Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/user_guide/supported_features.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ This table summarize the status of features on Spyre. By default, those features
| Automatic Prefix Caching | ✅ |
| LoRA | ⛔ |
| Speculative Decoding | ⛔ |
| Guided Decoding | ⛔ |
| Guided Decoding | ✅ |
| Enc-dec | ⛔ |
| Multi Modality | ⚠️ |
| LogProbs | ✅ |
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ override-dependencies = [
# vllm 0.18.0 pins compressed-tensors==0.13.0; override to 0.14.0.1
# TODO: remove once minimum vllm is bumped past 0.18.0
"compressed-tensors==0.14.0.1",

# llguidance>=1.7.3 fixes s390x endianness issues.
# This conflicts with vLLM's version range (llguidance >= 1.3.0, < 1.4.0).
# TODO: Remove this override once vLLM's requirement range includes or moves past 1.7.3.
# See: https://github.com/vllm-project/vllm/blob/v0.19.1/requirements/common.txt#L22
"llguidance>=1.7.3"
]
# This adds constraints to all dependent build environments, which will ensure everything is built
# with the same version of torch. This CANNOT conflict with a package's existing build dependencies
Expand Down
355 changes: 355 additions & 0 deletions tests/e2e/test_structured_outputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,355 @@
"""End-to-end tests for structured output decoding.

Tests structured output support across different backends (guidance, xgrammar, outlines)
and ensures that prompts without structured output requests don't accidentally have them applied.
"""

import json
import pytest
import re
from llm_cache import LLM_CACHE
from spyre_util import ModelInfo, patch_environment
from vllm import SamplingParams, LLM
from vllm.sampling_params import StructuredOutputsParams
from vllm.config import StructuredOutputsConfig

# Every test in this module exercises the chunked-prefill scheduling path.
pytestmark = [pytest.mark.chunked_prefill]


# Parametrize all tests over the three structured output backends.
# Note: Backend support varies by feature:
# - guidance: supports json_object, json (schema), regex, choice
# - xgrammar: supports json_object, json (schema), regex, choice
# - outlines: supports json (schema), regex, choice (NOT json_object)
STRUCTURED_OUTPUT_BACKENDS = ["guidance", "xgrammar", "outlines"]

# Backends that support json_object (free-form JSON without a schema).
JSON_OBJECT_BACKENDS = ["guidance", "xgrammar"]  # outlines requires schema


def _get_llm_with_structured_outputs(
    model: "ModelInfo | str",
    backend: str,
    monkeypatch,
    max_model_len: int,
    max_num_seqs: int,
    max_num_batched_tokens: int,
    structured_output_backend: str,
) -> LLM:
    """Create an ``LLM`` configured for structured output decoding.

    Args:
        model: Model to load; either a ``ModelInfo`` (name + revision) or a
            plain model-name string with no pinned revision.
        backend: Dynamo backend name forwarded to ``patch_environment``
            (e.g. "sendnn" or "eager").
        monkeypatch: pytest monkeypatch fixture, used by ``patch_environment``.
        max_model_len: Maximum model context length.
        max_num_seqs: Maximum number of concurrently scheduled sequences.
        max_num_batched_tokens: Token budget per scheduling step.
        structured_output_backend: Grammar backend for structured outputs
            ("guidance", "xgrammar", or "outlines").

    Returns:
        A freshly constructed vLLM ``LLM`` with prefix caching enabled and
        the requested structured-outputs backend.
    """
    # Clear the shared LLM cache so this engine does not collide with one
    # created by a previous test, then patch the Spyre environment.
    LLM_CACHE.clear()
    patch_environment(
        backend,
        monkeypatch,
        max_num_batched_tokens=max_num_batched_tokens,
    )

    # Accept either a ModelInfo record or a bare model-name string; the
    # annotation reflects both accepted forms.
    if isinstance(model, ModelInfo):
        model_name = model.name
        revision = model.revision
    else:
        model_name = model
        revision = None

    return LLM(
        model=model_name,
        tokenizer=model_name,
        revision=revision,
        tokenizer_revision=revision,
        max_model_len=max_model_len,
        max_num_seqs=max_num_seqs,
        tensor_parallel_size=1,
        max_num_batched_tokens=max_num_batched_tokens,
        enable_prefix_caching=True,
        structured_outputs_config=StructuredOutputsConfig(backend=structured_output_backend),
    )


@pytest.mark.parametrize("structured_output_backend", JSON_OBJECT_BACKENDS)
def test_structured_output_json_object(
    model: ModelInfo,
    backend,
    monkeypatch,
    max_model_len,
    max_num_seqs,
    max_num_batched_tokens,
    use_llm_cache,
    structured_output_backend: str,
):
    """Test that structured output with json_object=True produces valid JSON."""
    llm = _get_llm_with_structured_outputs(
        model,
        backend,
        monkeypatch,
        max_model_len,
        max_num_seqs,
        max_num_batched_tokens,
        structured_output_backend,
    )

    prompt = "Generate a JSON object with name and age fields for a person."
    sampling = SamplingParams(
        temperature=0.0,
        max_tokens=200,  # generous budget so the JSON object can be completed
        structured_outputs=StructuredOutputsParams(json_object=True),
    )

    output_text = llm.generate([prompt], [sampling])[0].outputs[0].text

    # The constrained decode must yield a parseable JSON object.
    try:
        parsed = json.loads(output_text)
    except json.JSONDecodeError as e:
        pytest.fail(f"Output is not valid JSON: {output_text}\nError: {e}")
    else:
        assert isinstance(parsed, dict), "Output should be a JSON object"


@pytest.mark.parametrize("structured_output_backend", STRUCTURED_OUTPUT_BACKENDS)
def test_structured_output_json_schema(
    model: ModelInfo,
    backend,
    monkeypatch,
    max_model_len,
    max_num_seqs,
    max_num_batched_tokens,
    use_llm_cache,
    structured_output_backend: str,
):
    """Test that structured output with a JSON schema validates correctly.

    The model is constrained to a schema requiring exactly a string ``name``
    and an integer ``age``; the decoded text must parse as JSON and satisfy
    both field types.
    """
    spyre_model = _get_llm_with_structured_outputs(
        model,
        backend,
        monkeypatch,
        max_model_len,
        max_num_seqs,
        max_num_batched_tokens,
        structured_output_backend,
    )

    schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "integer"},
        },
        "required": ["name", "age"],
        "additionalProperties": False,
    }

    prompt = "Generate a person with name and age only."

    params = SamplingParams(
        temperature=0.0,
        max_tokens=200,  # Increased to ensure JSON completion
        structured_outputs=StructuredOutputsParams(json=schema),
    )

    outputs = spyre_model.generate([prompt], [params])
    output_text = outputs[0].outputs[0].text

    # Keep the try body minimal: only json.loads can raise JSONDecodeError.
    # (The schema assertions previously sat inside this try, where the
    # except clause could never catch their AssertionError anyway.)
    try:
        json_obj = json.loads(output_text)
    except json.JSONDecodeError as e:
        pytest.fail(f"Output is not valid JSON: {output_text}\nError: {e}")

    assert isinstance(json_obj, dict), "Output should be a JSON object"
    assert "name" in json_obj, "Output should have 'name' field"
    assert "age" in json_obj, "Output should have 'age' field"
    assert isinstance(json_obj["name"], str), "'name' should be a string"
    # bool is a subclass of int in Python, but JSON true/false is not a
    # valid integer under the schema — exclude it explicitly.
    assert isinstance(json_obj["age"], int) and not isinstance(
        json_obj["age"], bool
    ), "'age' should be an integer"


@pytest.mark.parametrize("structured_output_backend", STRUCTURED_OUTPUT_BACKENDS)
def test_structured_output_regex(
    model: ModelInfo,
    backend,
    monkeypatch,
    max_model_len,
    max_num_seqs,
    max_num_batched_tokens,
    use_llm_cache,
    structured_output_backend: str,
):
    """Test that structured output with regex pattern is enforced."""
    llm = _get_llm_with_structured_outputs(
        model,
        backend,
        monkeypatch,
        max_model_len,
        max_num_seqs,
        max_num_batched_tokens,
        structured_output_backend,
    )

    # Constrain generation to a phone number shaped XXX-XXX-XXXX.
    phone_regex = r"\d{3}-\d{3}-\d{4}"
    sampling = SamplingParams(
        temperature=0.0,
        max_tokens=20,
        structured_outputs=StructuredOutputsParams(regex=phone_regex),
    )

    result = llm.generate(
        ["Generate a phone number in XXX-XXX-XXXX format."], [sampling]
    )
    output_text = result[0].outputs[0].text.strip()

    # The decoded text must match the pattern end-to-end, not just contain it.
    assert re.fullmatch(phone_regex, output_text) is not None, (
        f"Output '{output_text}' does not match regex pattern '{phone_regex}'"
    )


@pytest.mark.parametrize("structured_output_backend", JSON_OBJECT_BACKENDS)
def test_structured_output_mixed_batch(
    model: ModelInfo,
    backend,
    monkeypatch,
    max_model_len,
    max_num_seqs,
    max_num_batched_tokens,
    use_llm_cache,
    structured_output_backend: str,
):
    """Test that requests with and without structured outputs can coexist.

    This is critical to ensure that prompts without structured output requests don't
    accidentally have structured outputs applied. Due to chunked prefill constraints,
    we submit requests sequentially.
    """
    llm = _get_llm_with_structured_outputs(
        model,
        backend,
        monkeypatch,
        max_model_len,
        max_num_seqs,
        max_num_batched_tokens,
        structured_output_backend,
    )

    # First request: constrained to emit a JSON object.
    structured_text = llm.generate(
        ["Generate a JSON object with name and age."],
        [
            SamplingParams(
                temperature=0.0,
                max_tokens=200,  # generous budget so the JSON can be completed
                structured_outputs=StructuredOutputsParams(json_object=True),
            )
        ],
    )[0].outputs[0].text

    try:
        parsed = json.loads(structured_text)
    except json.JSONDecodeError as e:
        pytest.fail(f"Structured output is not valid JSON: {structured_text}\nError: {e}")
    else:
        assert isinstance(parsed, dict), "Structured output should be a JSON object"

    # Second request: plain sampling params — no grammar should be attached.
    freeform_text = llm.generate(
        ["Write a short story about a cat."],
        [SamplingParams(temperature=0.0, max_tokens=50)],
    )[0].outputs[0].text

    # Only check that generation happened; free-form output is deliberately
    # NOT required to be JSON or follow any structure.
    assert len(freeform_text) > 0, "Freeform output should have text"


@pytest.mark.parametrize("structured_output_backend", STRUCTURED_OUTPUT_BACKENDS)
def test_structured_output_choice(
    model: ModelInfo,
    backend,
    monkeypatch,
    max_model_len,
    max_num_seqs,
    max_num_batched_tokens,
    use_llm_cache,
    structured_output_backend: str,
):
    """Test that structured output with choice constraint works correctly."""
    llm = _get_llm_with_structured_outputs(
        model,
        backend,
        monkeypatch,
        max_model_len,
        max_num_seqs,
        max_num_batched_tokens,
        structured_output_backend,
    )

    # The grammar restricts decoding to exactly one of these strings.
    allowed = ["yes", "no", "maybe"]
    sampling = SamplingParams(
        temperature=0.0,
        max_tokens=10,
        structured_outputs=StructuredOutputsParams(choice=allowed),
    )

    result = llm.generate(
        ["Is the sky blue? Answer with yes, no, or maybe."], [sampling]
    )
    answer = result[0].outputs[0].text.strip().lower()

    assert answer in allowed, f"Output '{answer}' not in allowed choices {allowed}"


@pytest.mark.parametrize("dynamo_backend", ["sendnn", "eager"])
@pytest.mark.parametrize("structured_output_backend", JSON_OBJECT_BACKENDS)
def test_structured_output_all_backends(
    model: ModelInfo,
    dynamo_backend,
    monkeypatch,
    max_model_len,
    max_num_seqs,
    max_num_batched_tokens,
    use_llm_cache,
    structured_output_backend: str,
):
    """Test that structured outputs work across all dynamo backends (sendnn/eager)."""
    llm = _get_llm_with_structured_outputs(
        model,
        dynamo_backend,
        monkeypatch,
        max_model_len,
        max_num_seqs,
        max_num_batched_tokens,
        structured_output_backend,
    )

    sampling = SamplingParams(
        temperature=0.0,
        max_tokens=200,  # generous budget so the JSON can be completed
        structured_outputs=StructuredOutputsParams(json_object=True),
    )
    text = llm.generate(
        ["Generate a JSON object with a single field 'answer' set to 42."],
        [sampling],
    )[0].outputs[0].text

    # Whatever the dynamo backend, the constrained output must parse as JSON.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as e:
        pytest.fail(
            f"Output is not valid JSON on {dynamo_backend} backend "
            f"with {structured_output_backend}: {text}\nError: {e}"
        )
    else:
        assert isinstance(parsed, dict), (
            f"Output should be a JSON object on {dynamo_backend} with {structured_output_backend}"
        )


# Made with Bob
Loading
Loading