Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/user_guide/supported_features.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ This table summarize the status of features on Spyre. By default, those features
| Automatic Prefix Caching | ✅ |
| LoRA | ⛔ |
| Speculative Decoding | ⛔ |
| Guided Decoding | ⛔ |
| Guided Decoding | ✅ |
| Enc-dec | ⛔ |
| Multi Modality | ⚠️ |
| LogProbs | ✅ |
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ override-dependencies = [
# vllm 0.18.0 pins compressed-tensors==0.13.0; override to 0.14.0.1
# TODO: remove once minimum vllm is bumped past 0.18.0
"compressed-tensors==0.14.0.1",

# llguidance>=1.7.3 fixes s390x endianness issues.
# This conflicts with vLLM's version range (llguidance >= 1.3.0, < 1.4.0).
# TODO: Remove this override once vLLM's requirement range includes or moves past 1.7.3.
# See: https://github.com/vllm-project/vllm/blob/v0.19.1/requirements/common.txt#L22
"llguidance>=1.7.3"
]
# This adds constraints to all dependent build environments, which will ensure everything is built
# with the same version of torch. This CANNOT conflict with a package's existing build dependencies
Expand Down
355 changes: 355 additions & 0 deletions tests/e2e/test_structured_outputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,355 @@
"""End-to-end tests for structured output decoding.

Tests structured output support across different backends (guidance, xgrammar, outlines)
and ensures that prompts without structured output requests don't accidentally have them applied.
"""

import json
import pytest
import re
from llm_cache import LLM_CACHE
from spyre_util import ModelInfo, patch_environment
from vllm import SamplingParams, LLM
from vllm.sampling_params import StructuredOutputsParams
from vllm.config import StructuredOutputsConfig

# Every test in this module exercises the chunked-prefill scheduling path.
pytestmark = [pytest.mark.chunked_prefill]


# Parametrize all tests over the three structured output backends.
# Note: Backend support varies by feature:
# - guidance: supports json_object, json (schema), regex, choice
# - xgrammar: supports json_object, json (schema), regex, choice
# - outlines: supports json (schema), regex, choice (NOT json_object)
STRUCTURED_OUTPUT_BACKENDS = ["guidance", "xgrammar", "outlines"]

# Backends that support json_object (free-form JSON without a schema).
JSON_OBJECT_BACKENDS = ["guidance", "xgrammar"]  # outlines requires schema


def _get_llm_with_structured_outputs(
    model: "ModelInfo | str",
    backend: str,
    monkeypatch,
    max_model_len: int,
    max_num_seqs: int,
    max_num_batched_tokens: int,
    structured_output_backend: str,
) -> LLM:
    """Create an ``LLM`` configured for structured output decoding.

    Args:
        model: Model to load; either a ``ModelInfo`` (name + revision) or a
            plain model-name string with no pinned revision.
        backend: Dynamo backend name forwarded to ``patch_environment``
            (e.g. "sendnn" or "eager").
        monkeypatch: pytest monkeypatch fixture, used by ``patch_environment``.
        max_model_len: Maximum model context length.
        max_num_seqs: Maximum number of concurrently scheduled sequences.
        max_num_batched_tokens: Token budget per scheduling step.
        structured_output_backend: Grammar backend for structured outputs
            ("guidance", "xgrammar", or "outlines").

    Returns:
        A freshly constructed vLLM ``LLM`` with prefix caching enabled and
        the requested structured-outputs backend.
    """
    # Clear the shared LLM cache so this engine does not collide with one
    # created by a previous test, then patch the Spyre environment.
    LLM_CACHE.clear()
    patch_environment(
        backend,
        monkeypatch,
        max_num_batched_tokens=max_num_batched_tokens,
    )

    # Accept either a ModelInfo record or a bare model-name string; the
    # annotation reflects both accepted forms.
    if isinstance(model, ModelInfo):
        model_name = model.name
        revision = model.revision
    else:
        model_name = model
        revision = None

    return LLM(
        model=model_name,
        tokenizer=model_name,
        revision=revision,
        tokenizer_revision=revision,
        max_model_len=max_model_len,
        max_num_seqs=max_num_seqs,
        tensor_parallel_size=1,
        max_num_batched_tokens=max_num_batched_tokens,
        enable_prefix_caching=True,
        structured_outputs_config=StructuredOutputsConfig(backend=structured_output_backend),
    )


@pytest.mark.parametrize("structured_output_backend", JSON_OBJECT_BACKENDS)
def test_structured_output_json_object(
    model: ModelInfo,
    backend,
    monkeypatch,
    max_model_len,
    max_num_seqs,
    max_num_batched_tokens,
    use_llm_cache,
    structured_output_backend: str,
):
    """Test that structured output with json_object=True produces valid JSON."""
    llm = _get_llm_with_structured_outputs(
        model,
        backend,
        monkeypatch,
        max_model_len,
        max_num_seqs,
        max_num_batched_tokens,
        structured_output_backend,
    )

    prompt = "Generate a JSON object with name and age fields for a person."
    sampling = SamplingParams(
        temperature=0.0,
        max_tokens=200,  # generous budget so the JSON object can be completed
        structured_outputs=StructuredOutputsParams(json_object=True),
    )

    output_text = llm.generate([prompt], [sampling])[0].outputs[0].text

    # The constrained decode must yield a parseable JSON object.
    try:
        parsed = json.loads(output_text)
    except json.JSONDecodeError as e:
        pytest.fail(f"Output is not valid JSON: {output_text}\nError: {e}")
    else:
        assert isinstance(parsed, dict), "Output should be a JSON object"


@pytest.mark.parametrize("structured_output_backend", STRUCTURED_OUTPUT_BACKENDS)
def test_structured_output_json_schema(
    model: ModelInfo,
    backend,
    monkeypatch,
    max_model_len,
    max_num_seqs,
    max_num_batched_tokens,
    use_llm_cache,
    structured_output_backend: str,
):
    """Test that structured output with a JSON schema validates correctly.

    The model is constrained to a schema requiring exactly a string ``name``
    and an integer ``age``; the decoded text must parse as JSON and satisfy
    both field types.
    """
    spyre_model = _get_llm_with_structured_outputs(
        model,
        backend,
        monkeypatch,
        max_model_len,
        max_num_seqs,
        max_num_batched_tokens,
        structured_output_backend,
    )

    schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "integer"},
        },
        "required": ["name", "age"],
        "additionalProperties": False,
    }

    prompt = "Generate a person with name and age only."

    params = SamplingParams(
        temperature=0.0,
        max_tokens=200,  # Increased to ensure JSON completion
        structured_outputs=StructuredOutputsParams(json=schema),
    )

    outputs = spyre_model.generate([prompt], [params])
    output_text = outputs[0].outputs[0].text

    # Keep the try body minimal: only json.loads can raise JSONDecodeError.
    # (The schema assertions previously sat inside this try, where the
    # except clause could never catch their AssertionError anyway.)
    try:
        json_obj = json.loads(output_text)
    except json.JSONDecodeError as e:
        pytest.fail(f"Output is not valid JSON: {output_text}\nError: {e}")

    assert isinstance(json_obj, dict), "Output should be a JSON object"
    assert "name" in json_obj, "Output should have 'name' field"
    assert "age" in json_obj, "Output should have 'age' field"
    assert isinstance(json_obj["name"], str), "'name' should be a string"
    # bool is a subclass of int in Python, but JSON true/false is not a
    # valid integer under the schema — exclude it explicitly.
    assert isinstance(json_obj["age"], int) and not isinstance(
        json_obj["age"], bool
    ), "'age' should be an integer"


@pytest.mark.parametrize("structured_output_backend", STRUCTURED_OUTPUT_BACKENDS)
def test_structured_output_regex(
    model: ModelInfo,
    backend,
    monkeypatch,
    max_model_len,
    max_num_seqs,
    max_num_batched_tokens,
    use_llm_cache,
    structured_output_backend: str,
):
    """Test that structured output with regex pattern is enforced."""
    llm = _get_llm_with_structured_outputs(
        model,
        backend,
        monkeypatch,
        max_model_len,
        max_num_seqs,
        max_num_batched_tokens,
        structured_output_backend,
    )

    # Constrain generation to a phone number shaped XXX-XXX-XXXX.
    phone_regex = r"\d{3}-\d{3}-\d{4}"
    sampling = SamplingParams(
        temperature=0.0,
        max_tokens=20,
        structured_outputs=StructuredOutputsParams(regex=phone_regex),
    )

    result = llm.generate(
        ["Generate a phone number in XXX-XXX-XXXX format."], [sampling]
    )
    output_text = result[0].outputs[0].text.strip()

    # The decoded text must match the pattern end-to-end, not just contain it.
    assert re.fullmatch(phone_regex, output_text) is not None, (
        f"Output '{output_text}' does not match regex pattern '{phone_regex}'"
    )


@pytest.mark.parametrize("structured_output_backend", JSON_OBJECT_BACKENDS)
def test_structured_output_mixed_batch(
    model: ModelInfo,
    backend,
    monkeypatch,
    max_model_len,
    max_num_seqs,
    max_num_batched_tokens,
    use_llm_cache,
    structured_output_backend: str,
):
    """Test that requests with and without structured outputs can coexist.

    This is critical to ensure that prompts without structured output requests don't
    accidentally have structured outputs applied. Due to chunked prefill constraints,
    we submit requests sequentially.
    """
    llm = _get_llm_with_structured_outputs(
        model,
        backend,
        monkeypatch,
        max_model_len,
        max_num_seqs,
        max_num_batched_tokens,
        structured_output_backend,
    )

    # First request: constrained to emit a JSON object.
    structured_text = llm.generate(
        ["Generate a JSON object with name and age."],
        [
            SamplingParams(
                temperature=0.0,
                max_tokens=200,  # generous budget so the JSON can be completed
                structured_outputs=StructuredOutputsParams(json_object=True),
            )
        ],
    )[0].outputs[0].text

    try:
        parsed = json.loads(structured_text)
    except json.JSONDecodeError as e:
        pytest.fail(f"Structured output is not valid JSON: {structured_text}\nError: {e}")
    else:
        assert isinstance(parsed, dict), "Structured output should be a JSON object"

    # Second request: plain sampling params — no grammar should be attached.
    freeform_text = llm.generate(
        ["Write a short story about a cat."],
        [SamplingParams(temperature=0.0, max_tokens=50)],
    )[0].outputs[0].text

    # Only check that generation happened; free-form output is deliberately
    # NOT required to be JSON or follow any structure.
    assert len(freeform_text) > 0, "Freeform output should have text"


@pytest.mark.parametrize("structured_output_backend", STRUCTURED_OUTPUT_BACKENDS)
def test_structured_output_choice(
    model: ModelInfo,
    backend,
    monkeypatch,
    max_model_len,
    max_num_seqs,
    max_num_batched_tokens,
    use_llm_cache,
    structured_output_backend: str,
):
    """Test that structured output with choice constraint works correctly."""
    llm = _get_llm_with_structured_outputs(
        model,
        backend,
        monkeypatch,
        max_model_len,
        max_num_seqs,
        max_num_batched_tokens,
        structured_output_backend,
    )

    # The grammar restricts decoding to exactly one of these strings.
    allowed = ["yes", "no", "maybe"]
    sampling = SamplingParams(
        temperature=0.0,
        max_tokens=10,
        structured_outputs=StructuredOutputsParams(choice=allowed),
    )

    result = llm.generate(
        ["Is the sky blue? Answer with yes, no, or maybe."], [sampling]
    )
    answer = result[0].outputs[0].text.strip().lower()

    assert answer in allowed, f"Output '{answer}' not in allowed choices {allowed}"


@pytest.mark.parametrize("dynamo_backend", ["sendnn", "eager"])
@pytest.mark.parametrize("structured_output_backend", JSON_OBJECT_BACKENDS)
def test_structured_output_all_backends(
    model: ModelInfo,
    dynamo_backend,
    monkeypatch,
    max_model_len,
    max_num_seqs,
    max_num_batched_tokens,
    use_llm_cache,
    structured_output_backend: str,
):
    """Test that structured outputs work across all dynamo backends (sendnn/eager)."""
    llm = _get_llm_with_structured_outputs(
        model,
        dynamo_backend,
        monkeypatch,
        max_model_len,
        max_num_seqs,
        max_num_batched_tokens,
        structured_output_backend,
    )

    sampling = SamplingParams(
        temperature=0.0,
        max_tokens=200,  # generous budget so the JSON can be completed
        structured_outputs=StructuredOutputsParams(json_object=True),
    )
    text = llm.generate(
        ["Generate a JSON object with a single field 'answer' set to 42."],
        [sampling],
    )[0].outputs[0].text

    # Whatever the dynamo backend, the constrained output must parse as JSON.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as e:
        pytest.fail(
            f"Output is not valid JSON on {dynamo_backend} backend "
            f"with {structured_output_backend}: {text}\nError: {e}"
        )
    else:
        assert isinstance(parsed, dict), (
            f"Output should be a JSON object on {dynamo_backend} with {structured_output_backend}"
        )


# Made with Bob
Loading
Loading