diff --git a/docs/user_guide/supported_features.md b/docs/user_guide/supported_features.md
index f6659462d..ca0572120 100644
--- a/docs/user_guide/supported_features.md
+++ b/docs/user_guide/supported_features.md
@@ -8,7 +8,7 @@ This table summarize the status of features on Spyre. By default, those features
 | Automatic Prefix Caching      |   ✅   |
 | LoRA                          |   ⛔   |
 | Speculative Decoding          |   ⛔   |
-| Guided Decoding               |   ⛔   |
+| Guided Decoding               |   ✅   |
 | Enc-dec                       |   ⛔   |
 | Multi Modality                |   ⚠️   |
 | LogProbs                      |   ✅   |
diff --git a/pyproject.toml b/pyproject.toml
index 1ef85f073..357367620 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -79,6 +79,12 @@ override-dependencies = [
     # vllm 0.18.0 pins compressed-tensors==0.13.0; override to 0.14.0.1
     # TODO: remove once minimum vllm is bumped past 0.18.0
     "compressed-tensors==0.14.0.1",
+
+    # llguidance>=1.7.3 fixes s390x endianness issues.
+    # This conflicts with vLLM's version range (llguidance >= 1.3.0, < 1.4.0).
+    # TODO: Remove this override once vLLM's requirement range includes or moves past 1.7.3.
+    # See: https://github.com/vllm-project/vllm/blob/v0.19.1/requirements/common.txt#L22
+    "llguidance>=1.7.3"
 ]
 # This adds constraints to all dependent build environments, which will ensure everything is built
 # with the same version of torch. This CANNOT conflict with a package's existing build dependencies
diff --git a/sendnn_inference/platform.py b/sendnn_inference/platform.py
index dbb24c5f5..16a6e2903 100644
--- a/sendnn_inference/platform.py
+++ b/sendnn_inference/platform.py
@@ -450,15 +450,6 @@ def validate_request(
         if params.prompt_logprobs is not None:
             raise ValueError("Prompt logprobs are currently not supported.")
 
-        # Structured Outputs are not supported yet and cause issues in our
-        # scheduler if included in the request
-        if params.structured_outputs is not None:
-            logger.warning(
-                "Structured outputs are currently not supported and "
-                "will be stripped from the request."
-            )
-            params.structured_outputs = None
-
         if "encoder_prompt" in processed_inputs:
             raise ValueError("Encoder-decoder models not supported ")
         if "prompt_token_ids" not in processed_inputs:
diff --git a/sendnn_inference/v1/core/scheduler.py b/sendnn_inference/v1/core/scheduler.py
index 61b0a8d13..fd88ba15d 100644
--- a/sendnn_inference/v1/core/scheduler.py
+++ b/sendnn_inference/v1/core/scheduler.py
@@ -116,6 +116,7 @@ def schedule(self) -> SchedulerOutput:
         while holdback_queue:
             self.waiting.append(holdback_queue.popleft())
 
+        outputs._spyre_grammar_output = self.get_grammar_bitmask(outputs)  # type: ignore[attr-defined]
         return outputs
 
     def _get_matching_warmup_shapes(
@@ -260,22 +261,6 @@ def schedule(self) -> "SchedulerOutput":
         while holdback_queue:
             if self.can_schedule_prefill(holdback_queue[0]):
                 new_request = holdback_queue.popleft()
-                # Remove structured_output_request
-                # NB: SpyrePlatform.validate_request() removes structured_output
-                # before the request gets here in most cases
-                # TODO: We don't currently support structured output and it
-                # breaks some assumptions the code makes. The problems is that
-                # a structured output request will stay in waiting for multiple
-                # iterations with status WAITING_FOR_FSM. To handle this
-                # properly we need to exclude such requests from entering
-                # ongoing_prefills but still pass them in the waiting queue to
-                # the base scheduler to track the FSM initialization.
-                if new_request.structured_output_request is not None:
-                    logger.warning(
-                        "Removing structured output from request: %s", new_request.request_id
-                    )
-                    new_request.structured_output_request = None
-                    new_request.status = RequestStatus.WAITING
 
                 logger.debug(
                     "Scheduling a new request (%d prompt tokens), holding back %d requests",
@@ -300,6 +285,8 @@ def schedule(self) -> "SchedulerOutput":
             "Ongoing prefill requests must be in the running queue."
         )
 
+        new_prefill_candidates: list[Request] = []
+
         # Check ongoing prefills
         if self.ongoing_prefills:
             # Some running requests are currently being prefilled. We need to
@@ -322,11 +309,25 @@ def schedule(self) -> "SchedulerOutput":
 
         # Check new requests to prefill
         elif len(self.waiting) > 0:
-            self.ongoing_prefills.extend(self.waiting)
-            # Hide current decodes from the scheduler
-            running_holdback = self.running
-            self.running = []
-            self.previous_step_was_prefill = True
+            # Separate requests that are ready to prefill from those waiting
+            # for grammar FSM initialization (WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR). Only
+            # ready requests should hide the decode batch.
+            ready_to_prefill = [
+                r
+                for r in self.waiting
+                if r.status != RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR  # type: ignore[attr-defined]
+            ]
+            if ready_to_prefill:
+                new_prefill_candidates = list(self.waiting)
+                # Hide current decodes from the scheduler
+                running_holdback = self.running
+                self.running = []
+                self.previous_step_was_prefill = True
+            else:
+                # Only WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR requests — let decodes continue and
+                # pass FSM-waiting requests through to the base scheduler.
+                running_holdback = []
+                self.previous_step_was_prefill = False
         else:
             self.previous_step_was_prefill = False
             running_holdback = []
@@ -334,6 +335,13 @@ def schedule(self) -> "SchedulerOutput":
         # delegate to super of SpyreScheduler: base V1 Scheduler
         outputs = super(SpyreScheduler, self).schedule()
 
+        # Track as ongoing prefills only the requests that were actually
+        # scheduled (i.e., moved from waiting to running by the base
+        # scheduler).  Structured output requests in WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR status
+        # are kept in self.waiting until their grammar FSM is ready.
+        if new_prefill_candidates:
+            self.ongoing_prefills.extend(r for r in new_prefill_candidates if r in self.running)
+
         # restore holdbacks after running the base scheduler
         self.running = self.running + running_holdback
         while holdback_queue:
@@ -345,6 +353,15 @@ def schedule(self) -> "SchedulerOutput":
             r.num_computed_tokens <= r.num_prompt_tokens + 1 for r in self.running
         ):
             logger.debug("Scheduled tokens in this step: %s", outputs.num_scheduled_tokens)
+
+        # Collect grammar bitmask synchronously for structured outputs.
+        # NOTE: This is done here because the Spyre model runner currently samples
+        # tokens inside execute_model() rather than implementing a separate
+        # sample_tokens() step. This means we cannot collect the grammar bitmask
+        # asynchronously while the model is running (as done in vLLM core).
+        # TODO: Implement sample_tokens() in SpyreModelRunner to enable async grammar
+        # collection for better performance.
+        outputs._spyre_grammar_output = self.get_grammar_bitmask(outputs)  # type: ignore[attr-defined]
         return outputs
 
     def can_schedule_prefill(self, request: Request) -> bool:
diff --git a/sendnn_inference/v1/worker/spyre_model_runner.py b/sendnn_inference/v1/worker/spyre_model_runner.py
index 50d8f191b..17781328b 100644
--- a/sendnn_inference/v1/worker/spyre_model_runner.py
+++ b/sendnn_inference/v1/worker/spyre_model_runner.py
@@ -23,6 +23,9 @@
 from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, ModelRunnerOutput, SamplerOutput
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.request import Request
+from vllm.v1.structured_output.utils import (
+    apply_grammar_bitmask as vllm_apply_grammar_bitmask,
+)
 
 import sendnn_inference.envs as envs_spyre
 import sendnn_inference.utils as utils_spyre
@@ -1502,6 +1505,26 @@ def is_cached_chunk(self, scheduler_output: SchedulerOutput):
             return True
         return False
 
+    def apply_grammar_bitmask(
+        self,
+        scheduler_output: "SchedulerOutput",
+        logits: torch.Tensor,
+        batch: SamplingInputBatch,
+    ) -> None:
+        """Apply the grammar bitmask in-place to constrain logits for structured
+        output requests; no-op when `_spyre_grammar_output` is missing or None.
+        """
+        grammar_output = getattr(scheduler_output, "_spyre_grammar_output", None)
+        if grammar_output is None:
+            return
+
+        vllm_apply_grammar_bitmask(
+            scheduler_output,
+            grammar_output,
+            batch,  # type: ignore[arg-type]
+            logits,
+        )
+
     @SpyrePlatform.inference_mode()
     def execute_model(
         self,
@@ -1558,6 +1581,13 @@ def execute_model(
             logger.debug("t_forward_pass: %.2fms [prefill single chunk][batch size 1]", (t1 * 1000))
             return self.prefill_output()
 
+        # Apply grammar bitmask for structured output requests.
+        self.apply_grammar_bitmask(
+            scheduler_output,
+            logits,
+            self.prefill_batch if is_prefill else self.input_batch,
+        )
+
         # Sample the next token.
         output: SamplerOutput | None = self.model.sample(
             logits=logits,
diff --git a/tests/e2e/test_structured_outputs.py b/tests/e2e/test_structured_outputs.py
new file mode 100644
index 000000000..cde1f32d8
--- /dev/null
+++ b/tests/e2e/test_structured_outputs.py
@@ -0,0 +1,278 @@
+"""End-to-end tests for structured output decoding.
+
+Tests structured output support across different backends (guidance, xgrammar, outlines)
+and ensures that prompts without structured output requests don't accidentally have them applied.
+"""
+
+import json
+import pytest
+import re
+from llm_cache import get_llm
+from spyre_util import ModelInfo
+from vllm import SamplingParams
+from vllm.sampling_params import StructuredOutputsParams
+from vllm.config import StructuredOutputsConfig
+
+pytestmark = [pytest.mark.chunked_prefill]
+
+
+# Each test is parametrized over the backends that support the feature it exercises.
+# Note: Backend support varies by feature:
+# - guidance: supports json_object, json (schema), regex, choice
+# - xgrammar: supports json_object, json (schema), regex, choice
+# - outlines: supports json (schema), regex, choice (NOT json_object)
+STRUCTURED_OUTPUT_BACKENDS = ["guidance", "xgrammar", "outlines"]
+
+# Backends that support json_object (free-form JSON without schema)
+JSON_OBJECT_BACKENDS = ["guidance", "xgrammar"]  # outlines requires schema
+
+
+@pytest.mark.parametrize("structured_output_backend", JSON_OBJECT_BACKENDS)
+def test_structured_output_json_object(
+    model: ModelInfo,
+    backend,
+    monkeypatch,
+    max_model_len,
+    max_num_seqs,
+    max_num_batched_tokens,
+    use_llm_cache,
+    structured_output_backend: str,
+):
+    """Test that structured output with json_object=True produces valid JSON."""
+    spyre_model = get_llm(
+        model=model,
+        max_model_len=max_model_len,
+        backend=backend,
+        monkeypatch=monkeypatch,
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+        use_pc=True,
+        cached=True,
+        structured_outputs_config=StructuredOutputsConfig(backend=structured_output_backend),
+    )
+
+    prompt = "Generate a JSON object with name and age fields for a person."
+
+    params = SamplingParams(
+        temperature=0.0,
+        max_tokens=50,
+        structured_outputs=StructuredOutputsParams(json_object=True),
+    )
+
+    outputs = spyre_model.generate([prompt], [params])
+    output_text = outputs[0].outputs[0].text
+
+    # Verify output is valid JSON
+    try:
+        json_obj = json.loads(output_text)
+        assert isinstance(json_obj, dict), "Output should be a JSON object"
+    except json.JSONDecodeError as e:
+        pytest.fail(f"Output is not valid JSON: {output_text}\nError: {e}")
+
+
+@pytest.mark.parametrize("structured_output_backend", STRUCTURED_OUTPUT_BACKENDS)
+def test_structured_output_json_schema(
+    model: ModelInfo,
+    backend,
+    monkeypatch,
+    max_model_len,
+    max_num_seqs,
+    max_num_batched_tokens,
+    use_llm_cache,
+    structured_output_backend: str,
+):
+    """Test that structured output with a JSON schema validates correctly."""
+    spyre_model = get_llm(
+        model=model,
+        max_model_len=max_model_len,
+        backend=backend,
+        monkeypatch=monkeypatch,
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+        use_pc=True,
+        cached=True,
+        structured_outputs_config=StructuredOutputsConfig(backend=structured_output_backend),
+    )
+
+    schema = {
+        "type": "object",
+        "properties": {
+            "name": {"type": "string"},
+            "age": {"type": "integer"},
+        },
+        "required": ["name", "age"],
+        "additionalProperties": False,
+    }
+
+    prompt = "Generate a person with name and age only."
+
+    params = SamplingParams(
+        temperature=0.0,
+        max_tokens=50,
+        structured_outputs=StructuredOutputsParams(json=schema),
+    )
+
+    outputs = spyre_model.generate([prompt], [params])
+    output_text = outputs[0].outputs[0].text
+
+    # Verify output is valid JSON matching the schema
+    try:
+        json_obj = json.loads(output_text)
+        assert isinstance(json_obj, dict), "Output should be a JSON object"
+        assert "name" in json_obj, "Output should have 'name' field"
+        assert "age" in json_obj, "Output should have 'age' field"
+        assert isinstance(json_obj["name"], str), "'name' should be a string"
+        assert isinstance(json_obj["age"], int), "'age' should be an integer"
+    except json.JSONDecodeError as e:
+        pytest.fail(f"Output is not valid JSON: {output_text}\nError: {e}")
+
+
+@pytest.mark.parametrize("structured_output_backend", STRUCTURED_OUTPUT_BACKENDS)
+def test_structured_output_regex(
+    model: ModelInfo,
+    backend,
+    monkeypatch,
+    max_model_len,
+    max_num_seqs,
+    max_num_batched_tokens,
+    use_llm_cache,
+    structured_output_backend: str,
+):
+    """Test that structured output with regex pattern is enforced."""
+    spyre_model = get_llm(
+        model=model,
+        max_model_len=max_model_len,
+        backend=backend,
+        monkeypatch=monkeypatch,
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+        use_pc=True,
+        cached=True,
+        structured_outputs_config=StructuredOutputsConfig(backend=structured_output_backend),
+    )
+
+    # Regex for phone number format: XXX-XXX-XXXX
+    phone_regex = r"\d{3}-\d{3}-\d{4}"
+
+    prompt = "Generate a phone number in XXX-XXX-XXXX format."
+
+    params = SamplingParams(
+        temperature=0.0,
+        max_tokens=20,
+        structured_outputs=StructuredOutputsParams(regex=phone_regex),
+    )
+
+    outputs = spyre_model.generate([prompt], [params])
+    output_text = outputs[0].outputs[0].text.strip()
+
+    # Verify output matches the regex pattern
+    match = re.fullmatch(phone_regex, output_text)
+    assert match is not None, f"Output '{output_text}' does not match regex pattern '{phone_regex}'"
+
+
+@pytest.mark.parametrize("structured_output_backend", JSON_OBJECT_BACKENDS)
+def test_structured_output_mixed_batch(
+    model: ModelInfo,
+    backend,
+    monkeypatch,
+    max_model_len,
+    max_num_seqs,
+    max_num_batched_tokens,
+    use_llm_cache,
+    structured_output_backend: str,
+):
+    """Test that one engine serves requests with and without structured outputs.
+
+    This is critical to ensure that prompts without structured output requests don't
+    accidentally have structured outputs applied. Due to chunked prefill constraints,
+    we submit requests sequentially with the same model.
+    """
+    spyre_model = get_llm(
+        model=model,
+        max_model_len=max_model_len,
+        backend=backend,
+        monkeypatch=monkeypatch,
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+        use_pc=True,
+        cached=True,
+        structured_outputs_config=StructuredOutputsConfig(backend=structured_output_backend),
+    )
+
+    # Request with structured output (JSON object)
+    prompt_structured = "Generate a JSON object with name and age."
+    params_structured = SamplingParams(
+        temperature=0.0,
+        max_tokens=50,
+        structured_outputs=StructuredOutputsParams(json_object=True),
+    )
+
+    # Request without structured output (free-form text)
+    prompt_freeform = "Write a short story about a cat."
+    params_freeform = SamplingParams(
+        temperature=0.0,
+        max_tokens=50,
+    )
+
+    # Generate with structured output first
+    output_structured = spyre_model.generate([prompt_structured], [params_structured])[0]
+    output_structured_text = output_structured.outputs[0].text
+
+    # Verify structured output is valid JSON
+    try:
+        json_obj = json.loads(output_structured_text)
+        assert isinstance(json_obj, dict), "Structured output should be a JSON object"
+    except json.JSONDecodeError as e:
+        pytest.fail(f"Structured output is not valid JSON: {output_structured_text}\nError: {e}")
+
+    # Generate without structured output
+    output_freeform = spyre_model.generate([prompt_freeform], [params_freeform])[0]
+    output_freeform_text = output_freeform.outputs[0].text
+
+    # Verify the freeform request still produces text (its content is not checked)
+    assert len(output_freeform_text) > 0, "Freeform output should have text"
+    # Don't enforce JSON structure - it should be free-form story text
+
+
+@pytest.mark.parametrize("structured_output_backend", STRUCTURED_OUTPUT_BACKENDS)
+def test_structured_output_choice(
+    model: ModelInfo,
+    backend,
+    monkeypatch,
+    max_model_len,
+    max_num_seqs,
+    max_num_batched_tokens,
+    use_llm_cache,
+    structured_output_backend: str,
+):
+    """Test that structured output with choice constraint works correctly."""
+    spyre_model = get_llm(
+        model=model,
+        max_model_len=max_model_len,
+        backend=backend,
+        monkeypatch=monkeypatch,
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+        use_pc=True,
+        cached=True,
+        structured_outputs_config=StructuredOutputsConfig(backend=structured_output_backend),
+    )
+
+    choices = ["yes", "no", "maybe"]
+
+    prompt = "Is the sky blue? Answer with yes, no, or maybe."
+
+    params = SamplingParams(
+        temperature=0.0,
+        max_tokens=10,
+        structured_outputs=StructuredOutputsParams(choice=choices),
+    )
+
+    outputs = spyre_model.generate([prompt], [params])
+    output_text = outputs[0].outputs[0].text.strip().lower()
+
+    # Verify output is one of the allowed choices
+    assert output_text in choices, f"Output '{output_text}' not in allowed choices {choices}"
+
+
+# Made with Bob
diff --git a/tests/llm_cache.py b/tests/llm_cache.py
index a7bf21816..806fafdfc 100644
--- a/tests/llm_cache.py
+++ b/tests/llm_cache.py
@@ -112,6 +112,7 @@ def get_cached_llm(
         max_num_seqs: int | None = None,
         use_pc: bool = False,
         max_num_batched_tokens: int | None = None,
+        structured_outputs_config=None,
     ) -> LLM:
         """Creates an LLM with the provided runtime configuration.
 
@@ -126,6 +127,8 @@ def get_cached_llm(
             "use_pc": use_pc,
             "max_num_batched_tokens": max_num_batched_tokens,
         }
+        if structured_outputs_config is not None:
+            runtime_config["structured_outputs_config"] = structured_outputs_config
         if warmup_shapes:
             runtime_config.update({"warmup_shapes": tuple(warmup_shapes)})
         else:
@@ -152,6 +155,7 @@ def get_cached_llm(
                 tensor_parallel_size=tensor_parallel_size,
                 max_num_batched_tokens=max_num_batched_tokens,
                 enable_prefix_caching=use_pc,
+                structured_outputs_config=structured_outputs_config,
             ),
         )
 
@@ -167,6 +171,7 @@ def _create_llm(
     max_num_seqs: int | None,
     max_num_batched_tokens: int | None,
     enable_prefix_caching: bool,
+    structured_outputs_config=None,
 ) -> LLM:
     if isinstance(model, ModelInfo):
         model_name = model.name
@@ -175,18 +180,22 @@ def _create_llm(
         model_name = model
         revision = None
 
-    return LLM(
-        model=model_name,
-        tokenizer=model_name,
-        revision=revision,
-        tokenizer_revision=revision,
-        max_model_len=max_model_len,
-        max_num_seqs=max_num_seqs,
-        tensor_parallel_size=tensor_parallel_size,
-        max_num_batched_tokens=max_num_batched_tokens,
-        logits_processors=[GoldenTokenInjector],
-        enable_prefix_caching=enable_prefix_caching,
-    )
+    llm_kwargs = {
+        "model": model_name,
+        "tokenizer": model_name,
+        "revision": revision,
+        "tokenizer_revision": revision,
+        "max_model_len": max_model_len,
+        "max_num_seqs": max_num_seqs,
+        "tensor_parallel_size": tensor_parallel_size,
+        "max_num_batched_tokens": max_num_batched_tokens,
+        "logits_processors": [GoldenTokenInjector],
+        "enable_prefix_caching": enable_prefix_caching,
+    }
+    if structured_outputs_config is not None:
+        llm_kwargs["structured_outputs_config"] = structured_outputs_config
+
+    return LLM(**llm_kwargs)
 
 
 class EngineCache:
@@ -407,6 +416,7 @@ def get_llm(
     max_num_batched_tokens: int | None = None,
     use_pc: bool = False,
     cached: bool = True,
+    structured_outputs_config=None,
 ) -> LLM:
     # Clear other caches first
     API_SERVER_CACHE.clear()
@@ -423,6 +433,7 @@ def get_llm(
             max_num_seqs=max_num_seqs,
             use_pc=use_pc,
             max_num_batched_tokens=max_num_batched_tokens,
+            structured_outputs_config=structured_outputs_config,
         )
 
     patch_environment(
@@ -439,6 +450,7 @@ def get_llm(
         max_num_seqs=max_num_seqs,
         max_num_batched_tokens=max_num_batched_tokens,
         enable_prefix_caching=use_pc,
+        structured_outputs_config=structured_outputs_config,
     )
 
 
diff --git a/tests/llm_cache_util.py b/tests/llm_cache_util.py
index f389f51ef..c4765f79c 100644
--- a/tests/llm_cache_util.py
+++ b/tests/llm_cache_util.py
@@ -75,6 +75,7 @@ class SortKey(NamedTuple):
     max_num_seqs: int = 0
     num_blocks: int = 0
     max_num_batched_tokens: int = 0
+    structured_output_backend: str = ""
     warmup_shapes: EmbeddingWarmupShapes | None = None
 
     @staticmethod
@@ -112,6 +113,7 @@ def from_item(item) -> "SortKey":
             use_pc=use_pc,
             num_blocks=SortKey._get_num_blocks(item),
             max_num_batched_tokens=SortKey._get_max_num_batched_tokens(item),
+            structured_output_backend=SortKey._get_structured_output_backend(item),
             **sort_kwargs,
         )
 
@@ -267,6 +269,20 @@ def _get_num_blocks(item) -> int:
         # Most tests don't use this param
         return 0
 
+    @staticmethod
+    def _get_structured_output_backend(item) -> str:
+        """Return the test's structured output backend, or an empty string if it has none."""
+        if "structured_output_backend" in item.callspec.params:
+            backend = item.callspec.params["structured_output_backend"]
+            SortKey._assert_param(
+                isinstance(backend, str),
+                "structured_output_backend must be a string.",
+                item,
+            )
+            return backend
+        # Most tests don't use structured outputs
+        return ""
+
     @staticmethod
     def _assert_param(condition, message, item):
         assert condition, (
diff --git a/tests/utils/test_platform_validation.py b/tests/utils/test_platform_validation.py
index 8fa80509a..880593e97 100644
--- a/tests/utils/test_platform_validation.py
+++ b/tests/utils/test_platform_validation.py
@@ -31,33 +31,32 @@ def mock_spyre_config():
 
 
 class TestStructuredOutputValidation:
-    """Test that platform validation strips structured outputs from requests."""
+    """Test that platform validation passes structured outputs through unchanged."""
 
-    def test_strips_structured_outputs(self):
-        """Test that validate_request sets structured_outputs to None."""
-        params = SamplingParams(
-            max_tokens=20, structured_outputs=StructuredOutputsParams(json_object=True)
-        )
+    def test_preserves_structured_outputs(self):
+        """Test that validate_request does not strip structured_outputs."""
+        structured_outputs = StructuredOutputsParams(json_object=True)
+        params = SamplingParams(max_tokens=20, structured_outputs=structured_outputs)
 
         assert params.structured_outputs is not None
 
         SpyrePlatform.validate_request(tokens_input(prompt_token_ids=[0]), params)
 
-        assert params.structured_outputs is None
+        assert params.structured_outputs is not None
 
-    def test_logs_warning_when_stripping(self, caplog_sendnn_inference):
-        """Test that a warning is logged when stripping structured_outputs."""
+    def test_no_warning_logged_for_structured_outputs(self, caplog_sendnn_inference):
+        """Test that no warning is logged when structured_outputs are present."""
         params = SamplingParams(
             max_tokens=20, structured_outputs=StructuredOutputsParams(json_object=True)
         )
 
         SpyrePlatform.validate_request(tokens_input(prompt_token_ids=[0]), params)
 
-        assert len(caplog_sendnn_inference.records) > 0
-        warning_record = caplog_sendnn_inference.records[0]
-        assert warning_record.levelname == "WARNING"
-        assert "Structured outputs" in warning_record.message
-        assert "not supported" in warning_record.message
+        warning_records = [r for r in caplog_sendnn_inference.records if r.levelname == "WARNING"]
+        assert not any(
+            "Structured outputs" in r.message and "not supported" in r.message
+            for r in warning_records
+        )
 
     @pytest.mark.parametrize(
         "structured_output",
@@ -66,18 +65,18 @@ def test_logs_warning_when_stripping(self, caplog_sendnn_inference):
             StructuredOutputsParams(regex="[0-9]+"),
         ],
     )
-    def test_strips_different_structured_output_types(self, structured_output):
-        """Test validation with different types of structured outputs."""
+    def test_preserves_different_structured_output_types(self, structured_output):
+        """Test validation preserves different types of structured outputs."""
         params = SamplingParams(max_tokens=20, structured_outputs=structured_output)
 
         assert params.structured_outputs is not None
 
         SpyrePlatform.validate_request(tokens_input(prompt_token_ids=[0]), params)
 
-        assert params.structured_outputs is None
+        assert params.structured_outputs is not None
 
     def test_preserves_other_sampling_params(self):
-        """Test that other sampling params are not affected by the fix."""
+        """Test that other sampling params are not affected by validation."""
         params = SamplingParams(
             max_tokens=20,
             temperature=0.5,
@@ -96,13 +95,12 @@ def test_preserves_other_sampling_params(self):
 
         SpyrePlatform.validate_request(tokens_input(prompt_token_ids=[0]), params)
 
-        # Verify other params are unchanged
+        # Verify all params are unchanged
         assert params.max_tokens == original_values["max_tokens"]
         assert params.temperature == original_values["temperature"]
         assert params.top_p == original_values["top_p"]
         assert params.top_k == original_values["top_k"]
-        # But structured_outputs should be None
-        assert params.structured_outputs is None
+        assert params.structured_outputs is not None
 
     def test_does_not_affect_pooling_params(self):
         """Test that PoolingParams are not affected (early return in validate_request)."""
diff --git a/tests/v1/core/test_scheduler_structured_outputs.py b/tests/v1/core/test_scheduler_structured_outputs.py
index 279dab629..17d830f29 100644
--- a/tests/v1/core/test_scheduler_structured_outputs.py
+++ b/tests/v1/core/test_scheduler_structured_outputs.py
@@ -1,7 +1,8 @@
 """Unit tests for scheduler handling of structured outputs.
 
-Tests the fix in sendnn_inference/v1/core/scheduler.py that strips
-structured_output_request from Request objects in the chunked prefill scheduler.
+Tests the structured output support in sendnn_inference/v1/core/scheduler.py that
+preserves structured_output_request on Request objects and attaches grammar
+output via _spyre_grammar_output attribute in the chunked prefill scheduler.
 
 These unit tests mock the scheduler dependencies and call the actual schedule() method.
 """
@@ -48,20 +49,22 @@ def mocked_scheduler():
 
     # Mock the base scheduler's schedule method and can_schedule_prefill,
     # but ChunkedPrefillSpyreScheduler.schedule uses the code implementation
+    mock_output = Mock()
+    mock_output.has_structured_output_requests = False
+    mock_output.num_scheduled_tokens = {}
+
     with (
         patch.object(ChunkedPrefillSpyreScheduler, "can_schedule_prefill", return_value=True),
-        patch("vllm.v1.core.sched.scheduler.Scheduler.schedule", return_value=Mock()),
+        patch("vllm.v1.core.sched.scheduler.Scheduler.schedule", return_value=mock_output),
     ):
         yield scheduler
 
 
 class TestSchedulerStructuredOutputHandling:
-    """Test that the scheduler strips structured_output_request from requests."""
+    """Test that the scheduler preserves structured_output_request on requests."""
 
-    def test_scheduler_strips_structured_output_request(
-        self, mocked_scheduler, caplog_sendnn_inference
-    ):
-        """Test that the scheduler removes structured_output_request from new requests."""
+    def test_scheduler_preserves_structured_output_request(self, mocked_scheduler):
+        """Test that the scheduler preserves structured_output_request on requests."""
 
         # Create a request with structured outputs
         sampling_params = SamplingParams(
@@ -88,15 +91,8 @@ def test_scheduler_strips_structured_output_request(
         # Call the actual schedule method
         mocked_scheduler.schedule()
 
-        # Verify structured_output_request was stripped
-        assert request.structured_output_request is None
-        assert request.status == RequestStatus.WAITING
-
-        # Verify warning was logged
-        assert any(
-            "Removing structured output" in record.message
-            for record in caplog_sendnn_inference.records
-        )
+        # Verify structured_output_request is preserved
+        assert request.structured_output_request is not None
 
     def test_scheduler_handles_request_without_structured_output(self, mocked_scheduler):
         """Test that requests without structured_output_request are unaffected."""
@@ -128,10 +124,8 @@ def test_scheduler_handles_request_without_structured_output(self, mocked_schedu
         assert request.structured_output_request is None
         # Status may have changed due to base scheduler, but that's OK
 
-    def test_scheduler_handles_multiple_requests_with_structured_outputs(
-        self, mocked_scheduler, caplog_sendnn_inference
-    ):
-        """Test that multiple requests with structured outputs are all stripped."""
+    def test_scheduler_handles_multiple_requests_with_structured_outputs(self, mocked_scheduler):
+        """Test that multiple requests with structured outputs are all preserved."""
 
         # Create multiple requests with structured outputs
         requests = []
@@ -161,55 +155,12 @@ def test_scheduler_handles_multiple_requests_with_structured_outputs(
         # Call the actual schedule method
         mocked_scheduler.schedule()
 
-        # Verify all were stripped
+        # Verify all are preserved
         for request in requests:
-            assert request.structured_output_request is None
-            assert request.status == RequestStatus.WAITING
-
-        # Verify warnings were logged for each request
-        warning_count = sum(
-            1
-            for record in caplog_sendnn_inference.records
-            if "Removing structured output" in record.message
-        )
-        assert warning_count == 3
-
-    def test_scheduler_only_strips_when_can_schedule_prefill_true(self, mocked_scheduler):
-        """Test that structured_output_request is only stripped when request can be scheduled."""
-
-        # Create a request with structured outputs
-        sampling_params = SamplingParams(
-            max_tokens=20,
-            temperature=0.0,
-            structured_outputs=StructuredOutputsParams(json_object=True),
-        )
-
-        request = Request(
-            request_id="test_req",
-            sampling_params=sampling_params,
-            prompt_token_ids=list(range(50)),
-            arrival_time=0,
-            lora_request=None,
-            pooling_params=None,
-        )
-
-        # Verify structured_output_request is set
-        assert request.structured_output_request is not None
-
-        # Add request to waiting queue
-        mocked_scheduler.waiting.append(request)
-        # Mock can_schedule_prefill to return False (request cannot be scheduled)
-        with patch.object(ChunkedPrefillSpyreScheduler, "can_schedule_prefill", return_value=False):
-            # Call the actual schedule method
-            mocked_scheduler.schedule()
+            assert request.structured_output_request is not None
 
-        # Verify structured_output_request was NOT stripped (request wasn't scheduled)
-        assert request.structured_output_request is not None
-
-    def test_scheduler_preserves_other_request_attributes(
-        self, mocked_scheduler, caplog_sendnn_inference
-    ):
-        """Test that other request attributes are not affected when stripping."""
+    def test_scheduler_preserves_other_request_attributes(self, mocked_scheduler):
+        """Test that other request attributes are not affected by scheduling."""
 
         sampling_params = SamplingParams(
             max_tokens=20,
@@ -243,9 +194,9 @@ def test_scheduler_preserves_other_request_attributes(
         assert request.prompt_token_ids == original_prompt_tokens
         assert request.arrival_time == original_arrival_time
         assert request.sampling_params is original_sampling_params
-        # But structured_output_request should be None
-        assert request.structured_output_request is None
-        assert request.status == RequestStatus.WAITING
+        # structured_output_request is kept, so the request should be waiting on its grammar
+        assert request.structured_output_request is not None
+        assert request.status == RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR
 
 
 # Made with Bob
diff --git a/uv.lock b/uv.lock
index 4dfda5fdc..0d61acd98 100644
--- a/uv.lock
+++ b/uv.lock
@@ -22,6 +22,7 @@ resolution-markers = [
 overrides = [
     { name = "compressed-tensors", specifier = "==0.14.0.1" },
     { name = "intel-extension-for-pytorch", marker = "sys_platform == 'never'" },
+    { name = "llguidance", specifier = ">=1.7.3" },
     { name = "llvmlite", marker = "platform_machine not in 's390x, ppc64le'", specifier = "==0.44.0" },
     { name = "opencv-python-headless", specifier = "==4.12.0.88" },
     { name = "pyarrow", marker = "platform_machine not in 's390x, ppc64le'" },
@@ -1622,15 +1623,25 @@ wheels = [
 
 [[package]]
 name = "llguidance"
-version = "1.3.0"
+version = "1.7.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/95/48/3f7a9d3ff1b36bba92b5107a3a21286821227afe9ea464736133994d61fb/llguidance-1.3.0.tar.gz", hash = "sha256:861249afd51dc325646834462ea827e57a5c2b2042e108e6aae7059fdad9104d", size = 1070460, upload-time = "2025-10-20T19:58:44.164Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/f0/1b/d23007f94b74a8465a8a12602579aca5f9cf4bf868dab5fd5b2d61a233ee/llguidance-1.7.3.tar.gz", hash = "sha256:b97ba454c723d70d3b036dea7ef7f2de376d0bd81ab3d99502cc1efe373b03ec", size = 1153440, upload-time = "2026-04-20T21:15:08.781Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/3b/33/be5acb85cd8cdc4afde33d9c234eece9f318e087920255af3c05864cd3e7/llguidance-1.3.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f7685222660a762e481ac633d49cc559c64980fe2ee59c8f932a5bb5cbc0c2c2", size = 3220647, upload-time = "2025-10-20T19:58:42.542Z" },
-    { url = "https://files.pythonhosted.org/packages/82/e6/b48bda5b15efeaeb62bd0dba8fc6a01d4ae5457a85dbb5d18632385fe15c/llguidance-1.3.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:098030ff0687261a3f1bd54cf21fe951fc861d56d37a0671250dd36677eaf224", size = 3099830, upload-time = "2025-10-20T19:58:40.826Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/11/44389d3d1526d7a5c38ffd587a5ebc61d7bee443ac1dea95f2089ad58f5f/llguidance-1.3.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f6caca5d78db7f76e1fbb0fff8607b861c32d47fa3d5dee2fc49de27ee269df", size = 2835242, upload-time = "2025-10-20T19:58:34.518Z" },
-    { url = "https://files.pythonhosted.org/packages/83/a8/1ff2bedb8f9acb46a2d2d603415d272bb622c142ea86f5b95445cc6e366c/llguidance-1.3.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc17e9dd602c3879bf91664a64bf72f54c74dbfbeb24ccfab6a5fe435b12f7aa", size = 3033133, upload-time = "2025-10-20T19:58:38.721Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/7e/809349638231f469b9056c0e1bfd924d5ef5558b3b3ec72d093b6fad33b1/llguidance-1.3.0-cp39-abi3-win_amd64.whl", hash = "sha256:1d1cd1c8618d1a13605d3e057c978651e551c8c469b481ee4041f1d6c436002d", size = 2789946, upload-time = "2025-10-20T19:58:45.958Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/f1/818e93b059bf00219cfcaa1157b176492e03a5f488cfe159566f7e6e5fde/llguidance-1.7.3-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:e2eeeeb54bb033bc070c828eba6a0644644756a6e4ce4898d5cd79caf2462390", size = 3240288, upload-time = "2026-04-20T21:14:42.984Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/c9/896f56f36673b230d32af473d957dfd8e979d66879ebf2eaf699257f572f/llguidance-1.7.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:fe42358c2fb476b69789c555743e0d12fb354d6ddf8bbd2c669c710ee4e2cbb2", size = 3144890, upload-time = "2026-04-20T21:14:44.868Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/ea/ddc889167111d00a91cb4f9b6cbc9e451786439a13c5522b01be479ae01c/llguidance-1.7.3-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc74a9390418d9a2ed7811f243eb1417842aa23a850233914311a51bbcea29ee", size = 3470103, upload-time = "2026-04-20T21:14:46.624Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/f3/79db5a135f5d587d0f589004e2352f77f127cc92db352dba3904685ad88b/llguidance-1.7.3-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c578a077a7b87c9ef57717dc9fa61513cbb3ebb30c5b58acd0c18b2fb627187", size = 3760709, upload-time = "2026-04-20T21:14:48.191Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/42/9542dae2efcbd83940bd9dbe7b65ad09342608d509535835c005b32d0a0a/llguidance-1.7.3-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1cbf32aa6cfbf0fefde5b8ea98903b1c29f4ac8216c082c740f7592618fff7f8", size = 3490209, upload-time = "2026-04-20T21:14:50.138Z" },
+    { url = "https://files.pythonhosted.org/packages/db/3a/fd453c7df03633f35c0e6ef4b9679df28ea6cd9983e4480710b710d2f834/llguidance-1.7.3-cp314-cp314t-win32.whl", hash = "sha256:d3ae30aa74ebb9727ad2649fbcdabf55c6dd4c7dd424094539f8508d780f70e8", size = 2600881, upload-time = "2026-04-20T21:14:51.636Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/7d/82ca82290f80a89de9fb6d64f9208917e13aa0a114619ffea0647c1cdbe6/llguidance-1.7.3-cp314-cp314t-win_amd64.whl", hash = "sha256:418f34ff6e1ec96cab89b13dcc15b1a696eb36aba90fe49e3fa06b593ddedb28", size = 2867893, upload-time = "2026-04-20T21:14:53.493Z" },
+    { url = "https://files.pythonhosted.org/packages/29/19/31349c112f1dd62e2e1613dd5fee10419fd67ca3497332b802fc98a9b4f6/llguidance-1.7.3-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:d441049d75286f60d55b28a73483ad02ffb742ef9b33ac8efa98b29c698897ec", size = 3248781, upload-time = "2026-04-20T21:14:55.023Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/d6/e8a6b4a17a0ddcd19dd522bc0e479939d3775456ba15116fc8cf51a3b1b0/llguidance-1.7.3-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:76aced208b63d732f15cc848765023c9e91e492f80ed3458b407f023a07761ff", size = 3149797, upload-time = "2026-04-20T21:14:56.9Z" },
+    { url = "https://files.pythonhosted.org/packages/31/c5/dc9156786739a267978b0b5e593b1d61e92db97f76b00794f625def8a176/llguidance-1.7.3-cp39-abi3-manylinux_2_31_aarch64.whl", hash = "sha256:1a35f8296159d7cecc488e702f698658da4b6a0eeca421f1f1bf1349d00bb0c7", size = 2891142, upload-time = "2026-04-20T21:14:58.747Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/c5/4564d131a02caefcf4876c26c33d05a35fddf53980b4c86fce31fdb27088/llguidance-1.7.3-cp39-abi3-manylinux_2_31_x86_64.whl", hash = "sha256:50dd687a5d944c898dfe66af7af86c7591a0425fe84c4862b8f5549582106732", size = 3083627, upload-time = "2026-04-20T21:15:00.592Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/05/201350217c781d6ad131af4fbb5644a551de1b9012a734bccc44300495f9/llguidance-1.7.3-cp39-abi3-manylinux_2_34_i686.whl", hash = "sha256:87b2019ff00b463558f6731b82246095bab202c45291a6038933bc75efbc674a", size = 3341351, upload-time = "2026-04-20T21:15:02.243Z" },
+    { url = "https://files.pythonhosted.org/packages/97/50/f89f4ba15ead1e472ace2b919adf28e9fb88334f6c5796e46ee1937def7c/llguidance-1.7.3-cp39-abi3-manylinux_2_39_riscv64.whl", hash = "sha256:0ca0ba2985a09a74c6572d72d0f1885c90f01276b079f32ab0fbe41818dfc955", size = 3611737, upload-time = "2026-04-20T21:15:04.264Z" },
+    { url = "https://files.pythonhosted.org/packages/59/b2/bd822432b8be03cc610790708ba6516a02dd4d1090f3dad95c74d8cdc77b/llguidance-1.7.3-cp39-abi3-win32.whl", hash = "sha256:ff5c6e16727fb1d72609600b8ec94023a57a13799583b9ca77f8de56c4425df9", size = 2613685, upload-time = "2026-04-20T21:15:05.738Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/97/2a488c3e696e3fb22e0bc5e07248736626f7ad92c0caf9d88dbc7866c6bd/llguidance-1.7.3-cp39-abi3-win_amd64.whl", hash = "sha256:c2f3d5f369fb74dc7ecdc4f686b15ec5522c6b81a903024078f9a6ab9b2dc1f4", size = 2873297, upload-time = "2026-04-20T21:15:07.451Z" },
 ]
 
 [[package]]
@@ -4657,7 +4668,7 @@ dependencies = [
     { name = "ijson" },
     { name = "intel-openmp", marker = "platform_machine == 'x86_64'" },
     { name = "lark" },
-    { name = "llguidance", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'ppc64le' or platform_machine == 's390x' or platform_machine == 'x86_64'" },
+    { name = "llguidance" },
     { name = "lm-format-enforcer" },
     { name = "mcp" },
     { name = "mistral-common", extra = ["image"] },