diff --git a/docs/user_guide/supported_features.md b/docs/user_guide/supported_features.md index f6659462d..ca0572120 100644 --- a/docs/user_guide/supported_features.md +++ b/docs/user_guide/supported_features.md @@ -8,7 +8,7 @@ This table summarize the status of features on Spyre. By default, those features | Automatic Prefix Caching | ✅ | | LoRA | ⛔ | | Speculative Decoding | ⛔ | -| Guided Decoding | ⛔ | +| Guided Decoding | ✅ | | Enc-dec | ⛔ | | Multi Modality | ⚠️ | | LogProbs | ✅ | diff --git a/pyproject.toml b/pyproject.toml index 1ef85f073..357367620 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,12 @@ override-dependencies = [ # vllm 0.18.0 pins compressed-tensors==0.13.0; override to 0.14.0.1 # TODO: remove once minimum vllm is bumped past 0.18.0 "compressed-tensors==0.14.0.1", + + # llguidance>=1.7.3 fixes s390x endianness issues. + # This conflicts with vLLM's version range (llguidance >= 1.3.0, < 1.4.0). + # TODO: Remove this override once vLLM's requirement range includes or moves past 1.7.3. + # See: https://github.com/vllm-project/vllm/blob/v0.19.1/requirements/common.txt#L22 + "llguidance>=1.7.3" ] # This adds constraints to all dependent build environments, which will ensure everything is built # with the same version of torch. This CANNOT conflict with a package's existing build dependencies diff --git a/sendnn_inference/platform.py b/sendnn_inference/platform.py index dbb24c5f5..16a6e2903 100644 --- a/sendnn_inference/platform.py +++ b/sendnn_inference/platform.py @@ -450,15 +450,6 @@ def validate_request( if params.prompt_logprobs is not None: raise ValueError("Prompt logprobs are currently not supported.") - # Structured Outputs are not supported yet and cause issues in our - # scheduler if included in the request - if params.structured_outputs is not None: - logger.warning( - "Structured outputs are currently not supported and " - "will be stripped from the request." - ) - params.structured_outputs = None - if "encoder_prompt" in processed_inputs: raise ValueError("Encoder-decoder models not supported ") if "prompt_token_ids" not in processed_inputs: diff --git a/sendnn_inference/v1/core/scheduler.py b/sendnn_inference/v1/core/scheduler.py index 61b0a8d13..fd88ba15d 100644 --- a/sendnn_inference/v1/core/scheduler.py +++ b/sendnn_inference/v1/core/scheduler.py @@ -116,6 +116,7 @@ def schedule(self) -> SchedulerOutput: while holdback_queue: self.waiting.append(holdback_queue.popleft()) + outputs._spyre_grammar_output = self.get_grammar_bitmask(outputs) # type: ignore[attr-defined] return outputs def _get_matching_warmup_shapes( @@ -260,22 +261,6 @@ def schedule(self) -> "SchedulerOutput": while holdback_queue: if self.can_schedule_prefill(holdback_queue[0]): new_request = holdback_queue.popleft() - # Remove structured_output_request - # NB: SpyrePlatform.validate_request() removes structured_output - # before the request gets here in most cases - # TODO: We don't currently support structured output and it - # breaks some assumptions the code makes. The problems is that - # a structured output request will stay in waiting for multiple - # iterations with status WAITING_FOR_FSM. To handle this - # properly we need to exclude such requests from entering - # ongoing_prefills but still pass them in the waiting queue to - # the base scheduler to track the FSM initialization. - if new_request.structured_output_request is not None: - logger.warning( - "Removing structured output from request: %s", new_request.request_id - ) - new_request.structured_output_request = None - new_request.status = RequestStatus.WAITING logger.debug( "Scheduling a new request (%d prompt tokens), holding back %d requests", @@ -300,6 +285,8 @@ def schedule(self) -> "SchedulerOutput": "Ongoing prefill requests must be in the running queue." ) + new_prefill_candidates: list[Request] = [] + # Check ongoing prefills if self.ongoing_prefills: # Some running requests are currently being prefilled. We need to @@ -322,11 +309,25 @@ def schedule(self) -> "SchedulerOutput": # Check new requests to prefill elif len(self.waiting) > 0: - self.ongoing_prefills.extend(self.waiting) - # Hide current decodes from the scheduler - running_holdback = self.running - self.running = [] - self.previous_step_was_prefill = True + # Separate requests that are ready to prefill from those waiting + # for grammar FSM initialization (WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR). Only + # ready requests should hide the decode batch. + ready_to_prefill = [ + r + for r in self.waiting + if r.status != RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR # type: ignore[attr-defined] + ] + if ready_to_prefill: + new_prefill_candidates = list(self.waiting) + # Hide current decodes from the scheduler + running_holdback = self.running + self.running = [] + self.previous_step_was_prefill = True + else: + # Only WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR requests — let decodes continue and + # pass FSM-waiting requests through to the base scheduler. + running_holdback = [] + self.previous_step_was_prefill = False else: self.previous_step_was_prefill = False running_holdback = [] @@ -334,6 +335,13 @@ def schedule(self) -> "SchedulerOutput": # delegate to super of SpyreScheduler: base V1 Scheduler outputs = super(SpyreScheduler, self).schedule() + # Track as ongoing prefills only the requests that were actually + # scheduled (i.e., moved from waiting to running by the base + # scheduler). Structured output requests in WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR status + # are kept in self.waiting until their grammar FSM is ready. + if new_prefill_candidates: + self.ongoing_prefills.extend(r for r in new_prefill_candidates if r in self.running) + # restore holdbacks after running the base scheduler self.running = self.running + running_holdback while holdback_queue: @@ -345,6 +353,15 @@ def schedule(self) -> "SchedulerOutput": r.num_computed_tokens <= r.num_prompt_tokens + 1 for r in self.running ): logger.debug("Scheduled tokens in this step: %s", outputs.num_scheduled_tokens) + + # Collect grammar bitmask synchronously for structured outputs. + # NOTE: This is done here because vllm-spyre currently combines token sampling + # in model_executor.execute_model() rather than implementing sample_tokens() + # in the model runner. This means we cannot collect the grammar bitmask + # asynchronously while the model is running (as done in vLLM core). + # TODO: Implement sample_tokens() in SpyreModelRunner to enable async grammar + # collection for better performance. + outputs._spyre_grammar_output = self.get_grammar_bitmask(outputs) # type: ignore[attr-defined] return outputs def can_schedule_prefill(self, request: Request) -> bool: diff --git a/sendnn_inference/v1/worker/spyre_model_runner.py b/sendnn_inference/v1/worker/spyre_model_runner.py index 50d8f191b..17781328b 100644 --- a/sendnn_inference/v1/worker/spyre_model_runner.py +++ b/sendnn_inference/v1/worker/spyre_model_runner.py @@ -23,6 +23,9 @@ from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, ModelRunnerOutput, SamplerOutput from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.request import Request +from vllm.v1.structured_output.utils import ( + apply_grammar_bitmask as vllm_apply_grammar_bitmask, +) import sendnn_inference.envs as envs_spyre import sendnn_inference.utils as utils_spyre @@ -1502,6 +1505,26 @@ def is_cached_chunk(self, scheduler_output: SchedulerOutput): return True return False + def apply_grammar_bitmask( + self, + scheduler_output: "SchedulerOutput", + logits: torch.Tensor, + batch: SamplingInputBatch, + ) -> None: + """Apply grammar bitmask in-place to constrain logits for structured + output requests. + """ + grammar_output = getattr(scheduler_output, "_spyre_grammar_output", None) + if grammar_output is None: + return + + vllm_apply_grammar_bitmask( + scheduler_output, + grammar_output, + batch, # type: ignore[arg-type] + logits, + ) + @SpyrePlatform.inference_mode() def execute_model( self, @@ -1558,6 +1581,13 @@ def execute_model( logger.debug("t_forward_pass: %.2fms [prefill single chunk][batch size 1]", (t1 * 1000)) return self.prefill_output() + # Apply grammar bitmask for structured output requests. + self.apply_grammar_bitmask( + scheduler_output, + logits, + self.prefill_batch if is_prefill else self.input_batch, + ) + # Sample the next token. output: SamplerOutput | None = self.model.sample( logits=logits, diff --git a/tests/e2e/test_structured_outputs.py b/tests/e2e/test_structured_outputs.py new file mode 100644 index 000000000..cde1f32d8 --- /dev/null +++ b/tests/e2e/test_structured_outputs.py @@ -0,0 +1,278 @@ +"""End-to-end tests for structured output decoding. + +Tests structured output support across different backends (guidance, xgrammar, outlines) +and ensures that prompts without structured output requests don't accidentally have them applied. +""" + +import json +import pytest +import re +from llm_cache import get_llm +from spyre_util import ModelInfo +from vllm import SamplingParams +from vllm.sampling_params import StructuredOutputsParams +from vllm.config import StructuredOutputsConfig + +pytestmark = [pytest.mark.chunked_prefill] + + +# Parametrize all tests over the three structured output backends +# Note: Backend support varies by feature: +# - guidance: supports json_object, json (schema), regex, choice +# - xgrammar: supports json_object, json (schema), regex, choice +# - outlines: supports json (schema), regex, choice (NOT json_object) +STRUCTURED_OUTPUT_BACKENDS = ["guidance", "xgrammar", "outlines"] + +# Backends that support json_object (free-form JSON without schema) +JSON_OBJECT_BACKENDS = ["guidance", "xgrammar"] # outlines requires schema + + +@pytest.mark.parametrize("structured_output_backend", JSON_OBJECT_BACKENDS) +def test_structured_output_json_object( + model: ModelInfo, + backend, + monkeypatch, + max_model_len, + max_num_seqs, + max_num_batched_tokens, + use_llm_cache, + structured_output_backend: str, +): + """Test that structured output with json_object=True produces valid JSON.""" + spyre_model = get_llm( + model=model, + max_model_len=max_model_len, + backend=backend, + monkeypatch=monkeypatch, + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + use_pc=True, + cached=True, + structured_outputs_config=StructuredOutputsConfig(backend=structured_output_backend), + ) + + prompt = "Generate a JSON object with name and age fields for a person." + + params = SamplingParams( + temperature=0.0, + max_tokens=50, + structured_outputs=StructuredOutputsParams(json_object=True), + ) + + outputs = spyre_model.generate([prompt], [params]) + output_text = outputs[0].outputs[0].text + + # Verify output is valid JSON + try: + json_obj = json.loads(output_text) + assert isinstance(json_obj, dict), "Output should be a JSON object" + except json.JSONDecodeError as e: + pytest.fail(f"Output is not valid JSON: {output_text}\nError: {e}") + + +@pytest.mark.parametrize("structured_output_backend", STRUCTURED_OUTPUT_BACKENDS) +def test_structured_output_json_schema( + model: ModelInfo, + backend, + monkeypatch, + max_model_len, + max_num_seqs, + max_num_batched_tokens, + use_llm_cache, + structured_output_backend: str, +): + """Test that structured output with a JSON schema validates correctly.""" + spyre_model = get_llm( + model=model, + max_model_len=max_model_len, + backend=backend, + monkeypatch=monkeypatch, + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + use_pc=True, + cached=True, + structured_outputs_config=StructuredOutputsConfig(backend=structured_output_backend), + ) + + schema = { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + }, + "required": ["name", "age"], + "additionalProperties": False, + } + + prompt = "Generate a person with name and age only." + + params = SamplingParams( + temperature=0.0, + max_tokens=50, + structured_outputs=StructuredOutputsParams(json=schema), + ) + + outputs = spyre_model.generate([prompt], [params]) + output_text = outputs[0].outputs[0].text + + # Verify output is valid JSON matching the schema + try: + json_obj = json.loads(output_text) + assert isinstance(json_obj, dict), "Output should be a JSON object" + assert "name" in json_obj, "Output should have 'name' field" + assert "age" in json_obj, "Output should have 'age' field" + assert isinstance(json_obj["name"], str), "'name' should be a string" + assert isinstance(json_obj["age"], int), "'age' should be an integer" + except json.JSONDecodeError as e: + pytest.fail(f"Output is not valid JSON: {output_text}\nError: {e}") + + +@pytest.mark.parametrize("structured_output_backend", STRUCTURED_OUTPUT_BACKENDS) +def test_structured_output_regex( + model: ModelInfo, + backend, + monkeypatch, + max_model_len, + max_num_seqs, + max_num_batched_tokens, + use_llm_cache, + structured_output_backend: str, +): + """Test that structured output with regex pattern is enforced.""" + spyre_model = get_llm( + model=model, + max_model_len=max_model_len, + backend=backend, + monkeypatch=monkeypatch, + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + use_pc=True, + cached=True, + structured_outputs_config=StructuredOutputsConfig(backend=structured_output_backend), + ) + + # Regex for phone number format: XXX-XXX-XXXX + phone_regex = r"\d{3}-\d{3}-\d{4}" + + prompt = "Generate a phone number in XXX-XXX-XXXX format." + + params = SamplingParams( + temperature=0.0, + max_tokens=20, + structured_outputs=StructuredOutputsParams(regex=phone_regex), + ) + + outputs = spyre_model.generate([prompt], [params]) + output_text = outputs[0].outputs[0].text.strip() + + # Verify output matches the regex pattern + match = re.fullmatch(phone_regex, output_text) + assert match is not None, f"Output '{output_text}' does not match regex pattern '{phone_regex}'" + + +@pytest.mark.parametrize("structured_output_backend", JSON_OBJECT_BACKENDS) +def test_structured_output_mixed_batch( + model: ModelInfo, + backend, + monkeypatch, + max_model_len, + max_num_seqs, + max_num_batched_tokens, + use_llm_cache, + structured_output_backend: str, +): + """Test that requests with and without structured outputs can coexist. + + This is critical to ensure that prompts without structured output requests don't + accidentally have structured outputs applied. Due to chunked prefill constraints, + we submit requests sequentially with the same model. + """ + spyre_model = get_llm( + model=model, + max_model_len=max_model_len, + backend=backend, + monkeypatch=monkeypatch, + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + use_pc=True, + cached=True, + structured_outputs_config=StructuredOutputsConfig(backend=structured_output_backend), + ) + + # Request with structured output (JSON object) + prompt_structured = "Generate a JSON object with name and age." + params_structured = SamplingParams( + temperature=0.0, + max_tokens=50, + structured_outputs=StructuredOutputsParams(json_object=True), + ) + + # Request without structured output (free-form text) + prompt_freeform = "Write a short story about a cat." + params_freeform = SamplingParams( + temperature=0.0, + max_tokens=50, + ) + + # Generate with structured output first + output_structured = spyre_model.generate([prompt_structured], [params_structured])[0] + output_structured_text = output_structured.outputs[0].text + + # Verify structured output is valid JSON + try: + json_obj = json.loads(output_structured_text) + assert isinstance(json_obj, dict), "Structured output should be a JSON object" + except json.JSONDecodeError as e: + pytest.fail(f"Structured output is not valid JSON: {output_structured_text}\nError: {e}") + + # Generate without structured output + output_freeform = spyre_model.generate([prompt_freeform], [params_freeform])[0] + output_freeform_text = output_freeform.outputs[0].text + + # Verify freeform output is not constrained (just has text) + assert len(output_freeform_text) > 0, "Freeform output should have text" + # Don't enforce JSON structure - it should be free-form story text + + +@pytest.mark.parametrize("structured_output_backend", STRUCTURED_OUTPUT_BACKENDS) +def test_structured_output_choice( + model: ModelInfo, + backend, + monkeypatch, + max_model_len, + max_num_seqs, + max_num_batched_tokens, + use_llm_cache, + structured_output_backend: str, +): + """Test that structured output with choice constraint works correctly.""" + spyre_model = get_llm( + model=model, + max_model_len=max_model_len, + backend=backend, + monkeypatch=monkeypatch, + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + use_pc=True, + cached=True, + structured_outputs_config=StructuredOutputsConfig(backend=structured_output_backend), + ) + + choices = ["yes", "no", "maybe"] + + prompt = "Is the sky blue? Answer with yes, no, or maybe." + + params = SamplingParams( + temperature=0.0, + max_tokens=10, + structured_outputs=StructuredOutputsParams(choice=choices), + ) + + outputs = spyre_model.generate([prompt], [params]) + output_text = outputs[0].outputs[0].text.strip().lower() + + # Verify output is one of the allowed choices + assert output_text in choices, f"Output '{output_text}' not in allowed choices {choices}" + + +# Made with Bob diff --git a/tests/llm_cache.py b/tests/llm_cache.py index a7bf21816..806fafdfc 100644 --- a/tests/llm_cache.py +++ b/tests/llm_cache.py @@ -112,6 +112,7 @@ def get_cached_llm( max_num_seqs: int | None = None, use_pc: bool = False, max_num_batched_tokens: int | None = None, + structured_outputs_config=None, ) -> LLM: """Creates an LLM with the provided runtime configuration. @@ -126,6 +127,8 @@ def get_cached_llm( "use_pc": use_pc, "max_num_batched_tokens": max_num_batched_tokens, } + if structured_outputs_config is not None: + runtime_config["structured_outputs_config"] = structured_outputs_config if warmup_shapes: runtime_config.update({"warmup_shapes": tuple(warmup_shapes)}) else: @@ -152,6 +155,7 @@ def get_cached_llm( tensor_parallel_size=tensor_parallel_size, max_num_batched_tokens=max_num_batched_tokens, enable_prefix_caching=use_pc, + structured_outputs_config=structured_outputs_config, ), ) @@ -167,6 +171,7 @@ def _create_llm( max_num_seqs: int | None, max_num_batched_tokens: int | None, enable_prefix_caching: bool, + structured_outputs_config=None, ) -> LLM: if isinstance(model, ModelInfo): model_name = model.name @@ -175,18 +180,22 @@ def _create_llm( model_name = model revision = None - return LLM( - model=model_name, - tokenizer=model_name, - revision=revision, - tokenizer_revision=revision, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs, - tensor_parallel_size=tensor_parallel_size, - max_num_batched_tokens=max_num_batched_tokens, - logits_processors=[GoldenTokenInjector], - enable_prefix_caching=enable_prefix_caching, - ) + llm_kwargs = { + "model": model_name, + "tokenizer": model_name, + "revision": revision, + "tokenizer_revision": revision, + "max_model_len": max_model_len, + "max_num_seqs": max_num_seqs, + "tensor_parallel_size": tensor_parallel_size, + "max_num_batched_tokens": max_num_batched_tokens, + "logits_processors": [GoldenTokenInjector], + "enable_prefix_caching": enable_prefix_caching, + } + if structured_outputs_config is not None: + llm_kwargs["structured_outputs_config"] = structured_outputs_config + + return LLM(**llm_kwargs) class EngineCache: @@ -407,6 +416,7 @@ def get_llm( max_num_batched_tokens: int | None = None, use_pc: bool = False, cached: bool = True, + structured_outputs_config=None, ) -> LLM: # Clear other caches first API_SERVER_CACHE.clear() @@ -423,6 +433,7 @@ def get_llm( max_num_seqs=max_num_seqs, use_pc=use_pc, max_num_batched_tokens=max_num_batched_tokens, + structured_outputs_config=structured_outputs_config, ) patch_environment( @@ -439,6 +450,7 @@ def get_llm( max_num_seqs=max_num_seqs, max_num_batched_tokens=max_num_batched_tokens, enable_prefix_caching=use_pc, + structured_outputs_config=structured_outputs_config, ) diff --git a/tests/llm_cache_util.py b/tests/llm_cache_util.py index f389f51ef..c4765f79c 100644 --- a/tests/llm_cache_util.py +++ b/tests/llm_cache_util.py @@ -75,6 +75,7 @@ class SortKey(NamedTuple): max_num_seqs: int = 0 num_blocks: int = 0 max_num_batched_tokens: int = 0 + structured_output_backend: str = "" warmup_shapes: EmbeddingWarmupShapes | None = None @staticmethod @@ -112,6 +113,7 @@ def from_item(item) -> "SortKey": use_pc=use_pc, num_blocks=SortKey._get_num_blocks(item), max_num_batched_tokens=SortKey._get_max_num_batched_tokens(item), + structured_output_backend=SortKey._get_structured_output_backend(item), **sort_kwargs, ) @@ -267,6 +269,20 @@ def _get_num_blocks(item) -> int: # Most tests don't use this param return 0 + @staticmethod + def _get_structured_output_backend(item) -> str: + """Extract structured output backend from test parameters.""" + if "structured_output_backend" in item.callspec.params: + backend = item.callspec.params["structured_output_backend"] + SortKey._assert_param( + isinstance(backend, str), + "structured_output_backend must be a string.", + item, + ) + return backend + # Most tests don't use structured outputs + return "" + @staticmethod def _assert_param(condition, message, item): assert condition, ( diff --git a/tests/utils/test_platform_validation.py b/tests/utils/test_platform_validation.py index 8fa80509a..880593e97 100644 --- a/tests/utils/test_platform_validation.py +++ b/tests/utils/test_platform_validation.py @@ -31,33 +31,32 @@ def mock_spyre_config(): class TestStructuredOutputValidation: - """Test that platform validation strips structured outputs from requests.""" + """Test that platform validation passes structured outputs through unchanged.""" - def test_strips_structured_outputs(self): - """Test that validate_request sets structured_outputs to None.""" - params = SamplingParams( - max_tokens=20, structured_outputs=StructuredOutputsParams(json_object=True) - ) + def test_preserves_structured_outputs(self): + """Test that validate_request does not strip structured_outputs.""" + structured_outputs = StructuredOutputsParams(json_object=True) + params = SamplingParams(max_tokens=20, structured_outputs=structured_outputs) assert params.structured_outputs is not None SpyrePlatform.validate_request(tokens_input(prompt_token_ids=[0]), params) - assert params.structured_outputs is None + assert params.structured_outputs is not None - def test_logs_warning_when_stripping(self, caplog_sendnn_inference): - """Test that a warning is logged when stripping structured_outputs.""" + def test_no_warning_logged_for_structured_outputs(self, caplog_sendnn_inference): + """Test that no warning is logged when structured_outputs are present.""" params = SamplingParams( max_tokens=20, structured_outputs=StructuredOutputsParams(json_object=True) ) SpyrePlatform.validate_request(tokens_input(prompt_token_ids=[0]), params) - assert len(caplog_sendnn_inference.records) > 0 - warning_record = caplog_sendnn_inference.records[0] - assert warning_record.levelname == "WARNING" - assert "Structured outputs" in warning_record.message - assert "not supported" in warning_record.message + warning_records = [r for r in caplog_sendnn_inference.records if r.levelname == "WARNING"] + assert not any( + "Structured outputs" in r.message and "not supported" in r.message + for r in warning_records + ) @pytest.mark.parametrize( "structured_output", @@ -66,18 +65,18 @@ def test_logs_warning_when_stripping(self, caplog_sendnn_inference): StructuredOutputsParams(regex="[0-9]+"), ], ) - def test_strips_different_structured_output_types(self, structured_output): - """Test validation with different types of structured outputs.""" + def test_preserves_different_structured_output_types(self, structured_output): + """Test validation preserves different types of structured outputs.""" params = SamplingParams(max_tokens=20, structured_outputs=structured_output) assert params.structured_outputs is not None SpyrePlatform.validate_request(tokens_input(prompt_token_ids=[0]), params) - assert params.structured_outputs is None + assert params.structured_outputs is not None def test_preserves_other_sampling_params(self): - """Test that other sampling params are not affected by the fix.""" + """Test that other sampling params are not affected by validation.""" params = SamplingParams( max_tokens=20, temperature=0.5, @@ -96,13 +95,12 @@ def test_preserves_other_sampling_params(self): SpyrePlatform.validate_request(tokens_input(prompt_token_ids=[0]), params) - # Verify other params are unchanged + # Verify all params are unchanged assert params.max_tokens == original_values["max_tokens"] assert params.temperature == original_values["temperature"] assert params.top_p == original_values["top_p"] assert params.top_k == original_values["top_k"] - # But structured_outputs should be None - assert params.structured_outputs is None + assert params.structured_outputs is not None def test_does_not_affect_pooling_params(self): """Test that PoolingParams are not affected (early return in validate_request).""" diff --git a/tests/v1/core/test_scheduler_structured_outputs.py b/tests/v1/core/test_scheduler_structured_outputs.py index 279dab629..17d830f29 100644 --- a/tests/v1/core/test_scheduler_structured_outputs.py +++ b/tests/v1/core/test_scheduler_structured_outputs.py @@ -1,7 +1,8 @@ """Unit tests for scheduler handling of structured outputs. -Tests the fix in sendnn_inference/v1/core/scheduler.py that strips -structured_output_request from Request objects in the chunked prefill scheduler. +Tests the structured output support in sendnn_inference/v1/core/scheduler.py that +preserves structured_output_request on Request objects and attaches grammar +output via _spyre_grammar_output attribute in the chunked prefill scheduler. These unit tests mock the scheduler dependencies and call the actual schedule() method. """ @@ -48,20 +49,22 @@ def mocked_scheduler(): # Mock the base scheduler's schedule method and can_schedule_prefill, # but ChunkedPrefillSpyreScheduler.schedule uses the code implementation + mock_output = Mock() + mock_output.has_structured_output_requests = False + mock_output.num_scheduled_tokens = {} + with ( patch.object(ChunkedPrefillSpyreScheduler, "can_schedule_prefill", return_value=True), - patch("vllm.v1.core.sched.scheduler.Scheduler.schedule", return_value=Mock()), + patch("vllm.v1.core.sched.scheduler.Scheduler.schedule", return_value=mock_output), ): yield scheduler class TestSchedulerStructuredOutputHandling: - """Test that the scheduler strips structured_output_request from requests.""" + """Test that the scheduler preserves structured_output_request on requests.""" - def test_scheduler_strips_structured_output_request( - self, mocked_scheduler, caplog_sendnn_inference - ): - """Test that the scheduler removes structured_output_request from new requests.""" + def test_scheduler_preserves_structured_output_request(self, mocked_scheduler): + """Test that the scheduler preserves structured_output_request on requests.""" # Create a request with structured outputs sampling_params = SamplingParams( @@ -88,15 +91,8 @@ def test_scheduler_strips_structured_output_request( # Call the actual schedule method mocked_scheduler.schedule() - # Verify structured_output_request was stripped - assert request.structured_output_request is None - assert request.status == RequestStatus.WAITING - - # Verify warning was logged - assert any( - "Removing structured output" in record.message - for record in caplog_sendnn_inference.records - ) + # Verify structured_output_request is preserved + assert request.structured_output_request is not None def test_scheduler_handles_request_without_structured_output(self, mocked_scheduler): """Test that requests without structured_output_request are unaffected.""" @@ -128,10 +124,8 @@ def test_scheduler_handles_request_without_structured_output(self, mocked_schedu assert request.structured_output_request is None # Status may have changed due to base scheduler, but that's OK - def test_scheduler_handles_multiple_requests_with_structured_outputs( - self, mocked_scheduler, caplog_sendnn_inference - ): - """Test that multiple requests with structured outputs are all stripped.""" + def test_scheduler_handles_multiple_requests_with_structured_outputs(self, mocked_scheduler): + """Test that multiple requests with structured outputs are all preserved.""" # Create multiple requests with structured outputs requests = [] @@ -161,55 +155,12 @@ def test_scheduler_handles_multiple_requests_with_structured_outputs( # Call the actual schedule method mocked_scheduler.schedule() - # Verify all were stripped + # Verify all are preserved for request in requests: - assert request.structured_output_request is None - assert request.status == RequestStatus.WAITING - - # Verify warnings were logged for each request - warning_count = sum( - 1 - for record in caplog_sendnn_inference.records - if "Removing structured output" in record.message - ) - assert warning_count == 3 - - def test_scheduler_only_strips_when_can_schedule_prefill_true(self, mocked_scheduler): - """Test that structured_output_request is only stripped when request can be scheduled.""" - - # Create a request with structured outputs - sampling_params = SamplingParams( - max_tokens=20, - temperature=0.0, - structured_outputs=StructuredOutputsParams(json_object=True), - ) - - request = Request( - request_id="test_req", - sampling_params=sampling_params, - prompt_token_ids=list(range(50)), - arrival_time=0, - lora_request=None, - pooling_params=None, - ) - - # Verify structured_output_request is set - assert request.structured_output_request is not None - - # Add request to waiting queue - mocked_scheduler.waiting.append(request) - # Mock can_schedule_prefill to return False (request cannot be scheduled) - with patch.object(ChunkedPrefillSpyreScheduler, "can_schedule_prefill", return_value=False): - # Call the actual schedule method - mocked_scheduler.schedule() + assert request.structured_output_request is not None - # Verify structured_output_request was NOT stripped (request wasn't scheduled) - assert request.structured_output_request is not None - - def test_scheduler_preserves_other_request_attributes( - self, mocked_scheduler, caplog_sendnn_inference - ): - """Test that other request attributes are not affected when stripping.""" + def test_scheduler_preserves_other_request_attributes(self, mocked_scheduler): + """Test that other request attributes are not affected by scheduling.""" sampling_params = SamplingParams( max_tokens=20, @@ -243,9 +194,9 @@ def test_scheduler_preserves_other_request_attributes( assert request.prompt_token_ids == original_prompt_tokens assert request.arrival_time == original_arrival_time assert request.sampling_params is original_sampling_params - # But structured_output_request should be None - assert request.structured_output_request is None - assert request.status == RequestStatus.WAITING + # structured_output_request is preserved + assert request.structured_output_request is not None + assert request.status == RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR # Made with Bob diff --git a/uv.lock b/uv.lock index 4dfda5fdc..0d61acd98 100644 --- a/uv.lock +++ b/uv.lock @@ -22,6 +22,7 @@ resolution-markers = [ overrides = [ { name = "compressed-tensors", specifier = "==0.14.0.1" }, { name = "intel-extension-for-pytorch", marker = "sys_platform == 'never'" }, + { name = "llguidance", specifier = ">=1.7.3" }, { name = "llvmlite", marker = "platform_machine not in 's390x, ppc64le'", specifier = "==0.44.0" }, { name = "opencv-python-headless", specifier = "==4.12.0.88" }, { name = "pyarrow", marker = "platform_machine not in 's390x, ppc64le'" }, @@ -1622,15 +1623,25 @@ wheels = [ [[package]] name = "llguidance" -version = "1.3.0" +version = "1.7.3" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/95/48/3f7a9d3ff1b36bba92b5107a3a21286821227afe9ea464736133994d61fb/llguidance-1.3.0.tar.gz", hash = "sha256:861249afd51dc325646834462ea827e57a5c2b2042e108e6aae7059fdad9104d", size = 1070460, upload-time = "2025-10-20T19:58:44.164Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f0/1b/d23007f94b74a8465a8a12602579aca5f9cf4bf868dab5fd5b2d61a233ee/llguidance-1.7.3.tar.gz", hash = "sha256:b97ba454c723d70d3b036dea7ef7f2de376d0bd81ab3d99502cc1efe373b03ec", size = 1153440, upload-time = "2026-04-20T21:15:08.781Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/33/be5acb85cd8cdc4afde33d9c234eece9f318e087920255af3c05864cd3e7/llguidance-1.3.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f7685222660a762e481ac633d49cc559c64980fe2ee59c8f932a5bb5cbc0c2c2", size = 3220647, upload-time = "2025-10-20T19:58:42.542Z" }, - { url = "https://files.pythonhosted.org/packages/82/e6/b48bda5b15efeaeb62bd0dba8fc6a01d4ae5457a85dbb5d18632385fe15c/llguidance-1.3.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:098030ff0687261a3f1bd54cf21fe951fc861d56d37a0671250dd36677eaf224", size = 3099830, upload-time = "2025-10-20T19:58:40.826Z" }, - { url = "https://files.pythonhosted.org/packages/aa/11/44389d3d1526d7a5c38ffd587a5ebc61d7bee443ac1dea95f2089ad58f5f/llguidance-1.3.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f6caca5d78db7f76e1fbb0fff8607b861c32d47fa3d5dee2fc49de27ee269df", size = 2835242, upload-time = "2025-10-20T19:58:34.518Z" }, - { url = "https://files.pythonhosted.org/packages/83/a8/1ff2bedb8f9acb46a2d2d603415d272bb622c142ea86f5b95445cc6e366c/llguidance-1.3.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc17e9dd602c3879bf91664a64bf72f54c74dbfbeb24ccfab6a5fe435b12f7aa", size = 3033133, upload-time = "2025-10-20T19:58:38.721Z" }, - { url = "https://files.pythonhosted.org/packages/5a/7e/809349638231f469b9056c0e1bfd924d5ef5558b3b3ec72d093b6fad33b1/llguidance-1.3.0-cp39-abi3-win_amd64.whl", hash = "sha256:1d1cd1c8618d1a13605d3e057c978651e551c8c469b481ee4041f1d6c436002d", size = 2789946, upload-time = "2025-10-20T19:58:45.958Z" }, + { url = "https://files.pythonhosted.org/packages/2c/f1/818e93b059bf00219cfcaa1157b176492e03a5f488cfe159566f7e6e5fde/llguidance-1.7.3-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:e2eeeeb54bb033bc070c828eba6a0644644756a6e4ce4898d5cd79caf2462390", size = 3240288, upload-time = "2026-04-20T21:14:42.984Z" }, + { url = "https://files.pythonhosted.org/packages/6b/c9/896f56f36673b230d32af473d957dfd8e979d66879ebf2eaf699257f572f/llguidance-1.7.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:fe42358c2fb476b69789c555743e0d12fb354d6ddf8bbd2c669c710ee4e2cbb2", size = 3144890, upload-time = "2026-04-20T21:14:44.868Z" }, + { url = "https://files.pythonhosted.org/packages/6c/ea/ddc889167111d00a91cb4f9b6cbc9e451786439a13c5522b01be479ae01c/llguidance-1.7.3-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc74a9390418d9a2ed7811f243eb1417842aa23a850233914311a51bbcea29ee", size = 3470103, upload-time = "2026-04-20T21:14:46.624Z" }, + { url = "https://files.pythonhosted.org/packages/d4/f3/79db5a135f5d587d0f589004e2352f77f127cc92db352dba3904685ad88b/llguidance-1.7.3-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c578a077a7b87c9ef57717dc9fa61513cbb3ebb30c5b58acd0c18b2fb627187", size = 3760709, upload-time = "2026-04-20T21:14:48.191Z" }, + { url = "https://files.pythonhosted.org/packages/0d/42/9542dae2efcbd83940bd9dbe7b65ad09342608d509535835c005b32d0a0a/llguidance-1.7.3-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1cbf32aa6cfbf0fefde5b8ea98903b1c29f4ac8216c082c740f7592618fff7f8", size = 3490209, upload-time = "2026-04-20T21:14:50.138Z" }, + { url = "https://files.pythonhosted.org/packages/db/3a/fd453c7df03633f35c0e6ef4b9679df28ea6cd9983e4480710b710d2f834/llguidance-1.7.3-cp314-cp314t-win32.whl", hash = "sha256:d3ae30aa74ebb9727ad2649fbcdabf55c6dd4c7dd424094539f8508d780f70e8", size = 2600881, upload-time = "2026-04-20T21:14:51.636Z" }, + { url = "https://files.pythonhosted.org/packages/bd/7d/82ca82290f80a89de9fb6d64f9208917e13aa0a114619ffea0647c1cdbe6/llguidance-1.7.3-cp314-cp314t-win_amd64.whl", hash = "sha256:418f34ff6e1ec96cab89b13dcc15b1a696eb36aba90fe49e3fa06b593ddedb28", size = 2867893, upload-time = "2026-04-20T21:14:53.493Z" }, + { url = "https://files.pythonhosted.org/packages/29/19/31349c112f1dd62e2e1613dd5fee10419fd67ca3497332b802fc98a9b4f6/llguidance-1.7.3-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:d441049d75286f60d55b28a73483ad02ffb742ef9b33ac8efa98b29c698897ec", size = 3248781, upload-time = "2026-04-20T21:14:55.023Z" }, + { url = "https://files.pythonhosted.org/packages/a7/d6/e8a6b4a17a0ddcd19dd522bc0e479939d3775456ba15116fc8cf51a3b1b0/llguidance-1.7.3-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:76aced208b63d732f15cc848765023c9e91e492f80ed3458b407f023a07761ff", size = 3149797, upload-time = "2026-04-20T21:14:56.9Z" }, + { url = "https://files.pythonhosted.org/packages/31/c5/dc9156786739a267978b0b5e593b1d61e92db97f76b00794f625def8a176/llguidance-1.7.3-cp39-abi3-manylinux_2_31_aarch64.whl", hash = "sha256:1a35f8296159d7cecc488e702f698658da4b6a0eeca421f1f1bf1349d00bb0c7", size = 2891142, upload-time = "2026-04-20T21:14:58.747Z" }, + { url = "https://files.pythonhosted.org/packages/4f/c5/4564d131a02caefcf4876c26c33d05a35fddf53980b4c86fce31fdb27088/llguidance-1.7.3-cp39-abi3-manylinux_2_31_x86_64.whl", hash = "sha256:50dd687a5d944c898dfe66af7af86c7591a0425fe84c4862b8f5549582106732", size = 3083627, upload-time = "2026-04-20T21:15:00.592Z" }, + { url = "https://files.pythonhosted.org/packages/4b/05/201350217c781d6ad131af4fbb5644a551de1b9012a734bccc44300495f9/llguidance-1.7.3-cp39-abi3-manylinux_2_34_i686.whl", hash = "sha256:87b2019ff00b463558f6731b82246095bab202c45291a6038933bc75efbc674a", size = 3341351, upload-time = "2026-04-20T21:15:02.243Z" }, + { url = "https://files.pythonhosted.org/packages/97/50/f89f4ba15ead1e472ace2b919adf28e9fb88334f6c5796e46ee1937def7c/llguidance-1.7.3-cp39-abi3-manylinux_2_39_riscv64.whl", hash = "sha256:0ca0ba2985a09a74c6572d72d0f1885c90f01276b079f32ab0fbe41818dfc955", size = 3611737, upload-time = "2026-04-20T21:15:04.264Z" }, + { url = "https://files.pythonhosted.org/packages/59/b2/bd822432b8be03cc610790708ba6516a02dd4d1090f3dad95c74d8cdc77b/llguidance-1.7.3-cp39-abi3-win32.whl", hash = "sha256:ff5c6e16727fb1d72609600b8ec94023a57a13799583b9ca77f8de56c4425df9", size = 2613685, upload-time = "2026-04-20T21:15:05.738Z" }, + { url = "https://files.pythonhosted.org/packages/3e/97/2a488c3e696e3fb22e0bc5e07248736626f7ad92c0caf9d88dbc7866c6bd/llguidance-1.7.3-cp39-abi3-win_amd64.whl", hash = "sha256:c2f3d5f369fb74dc7ecdc4f686b15ec5522c6b81a903024078f9a6ab9b2dc1f4", size = 2873297, upload-time = "2026-04-20T21:15:07.451Z" }, ] [[package]] @@ -4657,7 +4668,7 @@ dependencies = [ { name = "ijson" }, { name = "intel-openmp", marker = "platform_machine == 'x86_64'" }, { name = "lark" }, - { name = "llguidance", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'ppc64le' or platform_machine == 's390x' or platform_machine == 'x86_64'" }, + { name = "llguidance" }, { name = "lm-format-enforcer" }, { name = "mcp" }, { name = "mistral-common", extra = ["image"] },