diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 960bbe744..a30445950 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -84,7 +84,7 @@ jobs:
           # Lower bound support
           - vllm_version:
               name: "vLLM:lowest"
-              repo: "git+https://github.com/vllm-project/vllm --tag v0.11.0"
+              repo: "git+https://github.com/vllm-project/vllm --tag v0.17.0"
             test_suite:
               name: "backward compat"
               markers: "compat or (cpu and basic and not quantized and not sb)"
@@ -94,86 +94,9 @@ jobs:
             os: "ubuntu-latest"
             python_version: "3.12"
           # Intermediate versions of vllm to check basic support for as well
-          - vllm_version:
-              name: "vLLM:0.11.1"
-              repo: "git+https://github.com/vllm-project/vllm --tag v0.11.1"
-            test_suite:
-              name: "backward compat"
-              markers: "cpu and basic and not quantized and not sb"
-              flags: "--timeout=300"
-              hf_model_2: "sentence-transformers/all-roberta-large-v1"
-              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
-            os: "ubuntu-latest"
-            python_version: "3.12"
-          - vllm_version:
-              name: "vLLM:0.11.2"
-              repo: "git+https://github.com/vllm-project/vllm --tag v0.11.2"
-            test_suite:
-              name: "backward compat"
-              markers: "cpu and basic and not quantized and not sb"
-              flags: "--timeout=300"
-              hf_model_2: "sentence-transformers/all-roberta-large-v1"
-              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
-            os: "ubuntu-latest"
-            python_version: "3.12"
-          - vllm_version:
-              name: "vLLM:0.12.0"
-              repo: "git+https://github.com/vllm-project/vllm --tag v0.12.0"
-            test_suite:
-              name: "backward compat"
-              markers: "cpu and basic and not quantized and not sb"
-              flags: "--timeout=300"
-              hf_model_2: "sentence-transformers/all-roberta-large-v1"
-              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
-            os: "ubuntu-latest"
-            python_version: "3.12"
-          - vllm_version:
-              name: "vLLM:0.13.0"
-              repo: "git+https://github.com/vllm-project/vllm --tag v0.13.0"
-            test_suite:
-              name: "backward compat"
-              markers: "cpu and basic and not quantized and not sb"
-              flags: "--timeout=300"
-              hf_model_2: "sentence-transformers/all-roberta-large-v1"
-              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
-            os: "ubuntu-latest"
-            python_version: "3.12"
-          - vllm_version:
-              name: "vLLM:0.14.0"
-              repo: "git+https://github.com/vllm-project/vllm --tag v0.14.0"
-            test_suite:
-              name: "backward compat"
-              markers: "cpu and basic and not quantized and not sb"
-              flags: "--timeout=300"
-              hf_model_2: "sentence-transformers/all-roberta-large-v1"
-              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
-            os: "ubuntu-latest"
-            python_version: "3.12"
-          - vllm_version:
-              name: "vLLM:0.14.1"
-              repo: "git+https://github.com/vllm-project/vllm --tag v0.14.1"
-            test_suite:
-              name: "backward compat"
-              markers: "cpu and basic and not quantized and not sb"
-              flags: "--timeout=300"
-              hf_model_2: "sentence-transformers/all-roberta-large-v1"
-              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
-            os: "ubuntu-latest"
-            python_version: "3.12"
-          - vllm_version:
-              name: "vLLM:0.15.0"
-              repo: "git+https://github.com/vllm-project/vllm --tag v0.15.0"
-            test_suite:
-              name: "backward compat"
-              markers: "cpu and basic and not quantized and not sb"
-              flags: "--timeout=300"
-              hf_model_2: "sentence-transformers/all-roberta-large-v1"
-              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
-            os: "ubuntu-latest"
-            python_version: "3.12"
           - vllm_version:
               name: "vLLM:0.15.1"
-              repo: "git+https://github.com/vllm-project/vllm --tag v0.15.1"
+              repo: "git+https://github.com/vllm-project/vllm --tag v0.17.1"
             test_suite:
               name: "backward compat"
               markers: "cpu and basic and not quantized and not sb"
diff --git a/pyproject.toml b/pyproject.toml
index 9d2883319..736a67f6d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ license = {text = "Apache 2"}
 dependencies = [
     "fms-model-optimizer[fp8]>=0.8.0",
     "ibm-fms>=1.7.0,<2.0",
-    "vllm>=0.11.0,<0.16.1",
+    "vllm>=0.17.0,<0.18.1",
 ]
 requires-python = ">=3.11"
 dynamic = ["version"]
@@ -70,7 +70,7 @@ environments = [
 ]
 
 [tool.uv.sources]
-vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.16.0" }
+vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.18.0" }
 
 [tool.ty.rules]
 possibly-missing-attribute = "ignore"
diff --git a/tests/conftest.py b/tests/conftest.py
index 3e38915ab..ef6fb48b6 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -214,6 +214,8 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
     SpyrePlatform._used_with_cli = False
     yield
     if should_do_global_cleanup_after_test:
+        if not hasattr(torch.accelerator, "empty_cache"):  # torch 2.7.1 compat for vllm v0.18.0
+            setattr(torch.accelerator, "empty_cache", lambda: None)  # noqa
         cleanup_dist_env_and_memory()
 
 
diff --git a/tests/e2e/test_chunked_prefill_tkv_steps.py b/tests/e2e/test_chunked_prefill_tkv_steps.py
index 7caf53ffb..7a7896101 100644
--- a/tests/e2e/test_chunked_prefill_tkv_steps.py
+++ b/tests/e2e/test_chunked_prefill_tkv_steps.py
@@ -119,9 +119,6 @@ def make_scheduler_output(
         scheduled_cached_reqs=scheduled_cached_reqs,
         num_scheduled_tokens=num_scheduled_tokens,
         total_num_scheduled_tokens=total_tokens,
-        scheduled_spec_decode_tokens={},
-        scheduled_encoder_inputs={},
-        num_common_prefix_blocks=[],
         finished_req_ids=finished_req_ids,
         kv_connector_metadata=None,
         **extra_args,
@@ -134,7 +131,6 @@ def make_new_request_data(req_id, prompt_len):
         prompt_token_ids=[42] * prompt_len,
         sampling_params=SamplingParams(),
         pooling_params=None,
-        eos_token_id=None,
     )
     return NewRequestData.from_request(req, block_ids=[])
 
diff --git a/tests/scheduling_utils.py b/tests/scheduling_utils.py
index 6a0a2c815..59326b618 100644
--- a/tests/scheduling_utils.py
+++ b/tests/scheduling_utils.py
@@ -120,7 +120,6 @@ def create_request_for_scheduler_test(
         request_id=str(request_id),
         sampling_params=sampling_params,
         prompt_token_ids=prompt,
-        eos_token_id=None,
         arrival_time=0,
         lora_request=None,
         pooling_params=None,
diff --git a/tests/spyre_util.py b/tests/spyre_util.py
index 50df7d71a..fe11dea7b 100644
--- a/tests/spyre_util.py
+++ b/tests/spyre_util.py
@@ -22,13 +22,8 @@
 from vllm_spyre.platform import SpyrePlatform
 from vllm_spyre import envs
 
-try:
-    # old
-    from vllm.utils import FlexibleArgumentParser, get_open_port
-except ImportError:
-    # new
-    from vllm.utils.argparse_utils import FlexibleArgumentParser
-    from vllm.utils.network_utils import get_open_port
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.network_utils import get_open_port
 
 from vllm.v1.request import Request
 
@@ -448,7 +443,6 @@ def create_random_request(
         request_id=str(request_id),
         prompt_token_ids=prompt_token_ids,
         sampling_params=sampling_params,
-        eos_token_id=None,
         arrival_time=0,
         lora_request=None,
         pooling_params=None,
diff --git a/tests/utils/test_cli_args.py b/tests/utils/test_cli_args.py
index 34996445b..9c1420637 100644
--- a/tests/utils/test_cli_args.py
+++ b/tests/utils/test_cli_args.py
@@ -10,12 +10,7 @@
 from vllm_spyre.config.model_registry import get_model_registry
 from spyre_util import environ_checkpoint, REFERENCE_MODELS
 
-try:
-    # old
-    from vllm.utils import FlexibleArgumentParser
-except ImportError:
-    # new
-    from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 global_default = 192
 
@@ -69,8 +64,6 @@ def sendnn_configured() -> bool:
         "32",
         "-tp",
         "4",
-        "--swap-space",  # to prevent a validation error in the 16GB memory test env.
-        "1",
     ]
 
     if model_name == "ibm-granite/granite-3.3-8b-instruct":
diff --git a/tests/utils/test_platform_validation.py b/tests/utils/test_platform_validation.py
index 36cb078c7..1f1a11653 100644
--- a/tests/utils/test_platform_validation.py
+++ b/tests/utils/test_platform_validation.py
@@ -4,8 +4,11 @@
 from SamplingParams during request validation.
 """
 
+from unittest.mock import MagicMock
 import pytest
+
 from vllm import SamplingParams
+from vllm.inputs.data import token_inputs
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import StructuredOutputsParams
 from vllm_spyre.platform import SpyrePlatform
@@ -13,6 +16,17 @@
 pytestmark = pytest.mark.skip_global_cleanup
 
 
+@pytest.fixture(autouse=True)
+def mock_spyre_config():
+    """Install a MagicMock as SpyrePlatform._config for each test, then restore the original."""
+    stub = MagicMock()
+    stub.model_config.max_model_len = 512
+    saved = SpyrePlatform._config
+    SpyrePlatform._config = stub
+    yield stub
+    SpyrePlatform._config = saved
+
+
 class TestStructuredOutputValidation:
     """Test that platform validation strips structured outputs from requests."""
 
@@ -24,7 +38,8 @@ def test_strips_structured_outputs(self):
 
         assert params.structured_outputs is not None
 
-        SpyrePlatform.validate_request("Test prompt", params)
+        processed_inputs = token_inputs(prompt_token_ids=[1, 2, 3])
+        SpyrePlatform.validate_request(processed_inputs, params)
 
         assert params.structured_outputs is None
 
@@ -34,7 +49,8 @@ def test_logs_warning_when_stripping(self, caplog_vllm_spyre):
             max_tokens=20, structured_outputs=StructuredOutputsParams(json_object=True)
         )
 
-        SpyrePlatform.validate_request("Test prompt", params)
+        processed_inputs = token_inputs(prompt_token_ids=[1, 2, 3])
+        SpyrePlatform.validate_request(processed_inputs, params)
 
         assert len(caplog_vllm_spyre.records) > 0
         warning_record = caplog_vllm_spyre.records[0]
@@ -55,7 +71,8 @@ def test_strips_different_structured_output_types(self, structured_output):
 
         assert params.structured_outputs is not None
 
-        SpyrePlatform.validate_request("Test prompt", params)
+        processed_inputs = token_inputs(prompt_token_ids=[1, 2, 3])
+        SpyrePlatform.validate_request(processed_inputs, params)
 
         assert params.structured_outputs is None
 
@@ -77,7 +94,8 @@ def test_preserves_other_sampling_params(self):
             "top_k": params.top_k,
         }
 
-        SpyrePlatform.validate_request("Test prompt", params)
+        processed_inputs = token_inputs(prompt_token_ids=[1, 2, 3])
+        SpyrePlatform.validate_request(processed_inputs, params)
 
         # Verify other params are unchanged
         assert params.max_tokens == original_values["max_tokens"]
@@ -92,7 +110,8 @@ def test_does_not_affect_pooling_params(self):
         pooling_params = PoolingParams()
 
         # Should not raise any errors and should return early
-        SpyrePlatform.validate_request("Test prompt", pooling_params)
+        processed_inputs = token_inputs(prompt_token_ids=[1, 2, 3])
+        SpyrePlatform.validate_request(processed_inputs, pooling_params)
 
         # PoolingParams don't have structured_outputs, so just verify no exception
         assert True  # If we got here, the early return worked
diff --git a/tests/utils/test_upstream_compatibility.py b/tests/utils/test_upstream_compatibility.py
index c73f64129..7cb64b261 100644
--- a/tests/utils/test_upstream_compatibility.py
+++ b/tests/utils/test_upstream_compatibility.py
@@ -1,164 +1,21 @@
+"""
+This file previously contained backwards compatibility tests for vLLM versions < 0.17.0.
+All backwards compatibility code has been removed as the minimum supported version is now v0.17.0.
+"""
+
 import os
 
 import pytest
-from vllm.v1.core.block_pool import BlockPool
-from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.core.single_type_kv_cache_manager import FullAttentionManager
-
-from vllm_spyre.compat_utils import dataclass_fields, has_argument
 
 pytestmark = pytest.mark.compat
 
 VLLM_VERSION = os.getenv("TEST_VLLM_VERSION", "default")
 
 
-def test_pin_memory_available():
-    if VLLM_VERSION == "vLLM:lowest":
-        try:
-            from vllm.utils import is_pin_memory_available  # # noqa #ty: ignore
-            from vllm.utils import make_tensor_with_pad  # # noqa #ty: ignore
-            from vllm.utils import init_cached_hf_modules  # # noqa #ty: ignore
-        except ImportError as e:
-            raise AssertionError(
-                "remove backwards compatibility imports for "
-                "is_pin_memory_available, "
-                "make_tensor_with_pad and init_cached_hf_modules"
-            ) from e
-
-
-def test_multi_modal_cache_stats():
-    if VLLM_VERSION == "vLLM:lowest":
-        # If this import succeeds then remove the backwards compatibility type
-        # def for MultiModalCacheStats
-        with pytest.raises(ImportError):
-            from vllm.v1.metrics.stats import MultiModalCacheStats  # # noqa #ty: ignore
-
-
-def test_v0_worker_base():
-    if VLLM_VERSION == "vLLM:lowest":
-        try:
-            from vllm.worker.worker_base import WorkerBase  # # noqa #ty: ignore
-        except ImportError as e:
-            raise AssertionError(
-                "remove the backwards compatibility code from the SpyreWorker initializer"
-            ) from e
-
-
-def test_structured_output_request_ids():
-    if VLLM_VERSION == "vLLM:lowest":
-        # Can remove "structured_output_request_ids" and "grammar_bitmask"
-        # from backwards compat
-        assert "structured_output_request_ids" in dataclass_fields(SchedulerOutput)
-
-
-def test_hash_block_size():
-    if VLLM_VERSION == "vLLM:lowest":
-        # Can supply `hash_block_size` everywhere, this was added in 0.12.0
-        assert not has_argument(BlockPool, "hash_block_size")
-
-
-def test_alignment_tokens():
-    if VLLM_VERSION == "vLLM:lowest":
-        # Can supply `alignment_tokens` everywhere, this was added in 0.12.0
-        assert not has_argument(FullAttentionManager.find_longest_cache_hit, "alignment_tokens")
-
-
-def test_argparse_utils():
-    if VLLM_VERSION == "vLLM:lowest":
-        try:
-            from vllm.utils import FlexibleArgumentParser  # noqa
-        except ImportError as e:
-            raise AssertionError(
-                "Fix backward compatible imports of "
-                "FlexibleArgumentParser which is no longer required"
-            ) from e
-
-
-def test_pooler_api():
-    if VLLM_VERSION == "vLLM:lowest":
-        try:
-            from vllm.model_executor.layers.pooler import ClassifierPooler, Pooler  # noqa
-        except ImportError as e:
-            raise AssertionError(
-                "Backwards compatibility code for old pooler API "
-                "ClassifierPooler no longer required, related to vLLM PR #31973"
-            ) from e
-
-
-def test_set_random_seed():
-    if VLLM_VERSION == "vLLM:lowest":
-        try:
-            from vllm.model_executor import set_random_seed  # noqa
-        except ImportError as e:
-            raise AssertionError(
-                "Backwards compatibility code for set_random_seed import no longer required"
-            ) from e
-
-
-def test_enable_caching():
-    if VLLM_VERSION == "vLLM:lowest":
-        # Can supply enable_caching everywhere, added in v0.14.0
-        assert not has_argument(FullAttentionManager.__init__, "enable_caching"), (
-            "Backwards compatibility code for enable_caching parameter "
-            "in FullAttentionManager no longer required"
-        )
-
-
-def test_pooling_metadata_build_cursor():
-    if VLLM_VERSION == "vLLM:lowest":
-        from vllm.v1.pool.metadata import PoolingMetadata
-
-        assert has_argument(PoolingMetadata.build_pooling_cursor, "num_scheduled_tokens"), (
-            "Backwards compatibility code for num_scheduled_tokens parameter "
-            "in PoolingMetadata.build_pooling_cursor no longer required "
-        )
-
-
-def test_allocate_new_computed_blocks():
-    if VLLM_VERSION == "vLLM:lowest":
-        # allocate_new_computed_blocks was added in v0.14.0
-        # When save_new_computed_blocks no longer exists, remove the
-        # try/except compatibility code in spyre_model_runner.py
-        assert hasattr(FullAttentionManager, "save_new_computed_blocks"), (
-            "Backwards compatibility code for save_new_computed_blocks "
-            "in FullAttentionManager no longer required, can use "
-            "allocate_new_computed_blocks everywhere"
-        )
-
-
-def test_allocate_new_blocks_new_arg():
-    if VLLM_VERSION == "vLLM:lowest":
-        # allocate_new_blocks added an argument in v0.15.0
-        # When that is our lowest, we can remove compat code that checks for the
-        # num_tokens_main_model argument (see _allocate_new_blocks_wrapper in
-        # spyre_model_runner.py)
-        assert not hasattr(FullAttentionManager.allocate_new_blocks, "num_tokens_main_model"), (
-            "Backwards compatibility code checking existence of "
-            "num_tokens_main_model argument to allocate_new_blocks "
-            "in FullAttentionManager no longer required"
-        )
-
-
-def test_profiler_config():
-    if VLLM_VERSION == "vLLM:lowest":
-        # When ProfilerConfig exists in lowest version, remove env var fallback
-        with pytest.raises(ImportError):
-            from vllm.profiler.wrapper import TorchProfilerWrapper  # noqa
-
-
-def test_multimodal_field_elem_signature():
-    if VLLM_VERSION == "vLLM:lowest":
-        from vllm.multimodal.inputs import MultiModalFieldElem
-
-        # When modality parameter is removed in lowest version, remove compat code
-        assert has_argument(MultiModalFieldElem.__init__, "modality"), (
-            "Backwards compatibility code for MultiModalFieldElem modality/key "
-            "parameters no longer required"
-        )
-
-
-def test_dict_prompt_tok_prompt():
-    if VLLM_VERSION == "vLLM:lowest":
-        # When these types exist in lowest version, remove try/except imports
-        with pytest.raises(ImportError):
-            from vllm.renderers.inputs import DictPrompt, TokPrompt  # noqa
+def test_minimum_version_is_017():
+    """Check the installed vLLM is >= 0.17.0; compat code for older versions was removed."""
+    from importlib.metadata import version
+
+    # Compare only (major, minor) so suffixes such as ".dev3+gabc" or "rc1" are ignored
+    major, minor = (int(part) for part in version("vllm").split(".")[:2])
+    assert (major, minor) >= (0, 17), "Minimum vLLM version is now 0.17.0"
diff --git a/tests/v1/core/test_scheduler_structured_outputs.py b/tests/v1/core/test_scheduler_structured_outputs.py
index 4282de668..27ced6b3b 100644
--- a/tests/v1/core/test_scheduler_structured_outputs.py
+++ b/tests/v1/core/test_scheduler_structured_outputs.py
@@ -73,7 +73,6 @@ def test_scheduler_strips_structured_output_request(self, mocked_scheduler, capl
             request_id="test_req",
             sampling_params=sampling_params,
             prompt_token_ids=list(range(50)),
-            eos_token_id=None,
             arrival_time=0,
             lora_request=None,
             pooling_params=None,
@@ -110,7 +109,6 @@ def test_scheduler_handles_request_without_structured_output(self, mocked_schedu
             request_id="test_req",
             sampling_params=sampling_params,
             prompt_token_ids=list(range(50)),
-            eos_token_id=None,
             arrival_time=0,
             lora_request=None,
             pooling_params=None,
@@ -146,7 +144,6 @@ def test_scheduler_handles_multiple_requests_with_structured_outputs(
                 request_id=f"test_req_{i}",
                 sampling_params=sampling_params,
                 prompt_token_ids=list(range(50)),
-                eos_token_id=None,
                 arrival_time=i,
                 lora_request=None,
                 pooling_params=None,
@@ -189,7 +186,6 @@ def test_scheduler_only_strips_when_can_schedule_prefill_true(self, mocked_sched
             request_id="test_req",
             sampling_params=sampling_params,
             prompt_token_ids=list(range(50)),
-            eos_token_id=None,
             arrival_time=0,
             lora_request=None,
             pooling_params=None,
@@ -224,7 +220,6 @@ def test_scheduler_preserves_other_request_attributes(
             request_id="test_req",
             sampling_params=sampling_params,
             prompt_token_ids=list(range(50)),
-            eos_token_id=100,
             arrival_time=1.5,
             lora_request=None,
             pooling_params=None,
@@ -233,7 +228,6 @@ def test_scheduler_preserves_other_request_attributes(
         # Store original values
         original_request_id = request.request_id
         original_prompt_tokens = list(request.prompt_token_ids) if request.prompt_token_ids else []
-        original_eos_token = request.eos_token_id
         original_arrival_time = request.arrival_time
         original_sampling_params = request.sampling_params
 
@@ -245,7 +239,6 @@ def test_scheduler_preserves_other_request_attributes(
         # Verify other attributes are unchanged
         assert request.request_id == original_request_id
         assert request.prompt_token_ids == original_prompt_tokens
-        assert request.eos_token_id == original_eos_token
         assert request.arrival_time == original_arrival_time
         assert request.sampling_params is original_sampling_params
         # But structured_output_request should be None
diff --git a/tests/v1/worker/mock_model.py b/tests/v1/worker/mock_model.py
index 8694415a1..ee3880e92 100644
--- a/tests/v1/worker/mock_model.py
+++ b/tests/v1/worker/mock_model.py
@@ -1,4 +1,3 @@
-from dataclasses import fields
 from typing import Any
 
 import pytest
@@ -147,12 +146,7 @@ def _schedule_new_request(self, request: Request, tokens_to_schedule: int) -> Sc
             scheduled_cached_reqs=CachedRequestData.make_empty(),
             num_scheduled_tokens=num_scheduled_tokens,
             total_num_scheduled_tokens=tokens_to_schedule,
-            scheduled_spec_decode_tokens={},
-            scheduled_encoder_inputs={},
-            num_common_prefix_blocks=[],
-            finished_req_ids=set(),
-            free_encoder_mm_hashes=[],
-            **self._compat_sched_output_kwargs(),
+            **self._extra_sched_output_kwargs(),
         )
 
     def _schedule_running_requests(
@@ -161,13 +155,9 @@ def _schedule_running_requests(
         num_computed_tokens: list[int],
         tokens_to_schedule: list[int],
     ) -> SchedulerOutput:
-        cached_reqs = CachedRequestData(
-            req_ids=req_ids,
-            new_token_ids=[],
-            new_block_ids=[],
-            num_computed_tokens=num_computed_tokens,
-            **self._compat_request_data_kwargs(),
-        )
+        cached_reqs = CachedRequestData.make_empty()
+        cached_reqs.req_ids = req_ids
+        cached_reqs.num_computed_tokens = num_computed_tokens
 
         num_scheduled_tokens = {}
         total_num_scheduled_tokens = 0
@@ -180,35 +170,18 @@ def _schedule_running_requests(
             scheduled_cached_reqs=cached_reqs,
             num_scheduled_tokens=num_scheduled_tokens,
             total_num_scheduled_tokens=total_num_scheduled_tokens,
-            scheduled_spec_decode_tokens={},
-            scheduled_encoder_inputs={},
-            num_common_prefix_blocks=[],
-            finished_req_ids=set(),
-            free_encoder_mm_hashes=[],
-            **self._compat_sched_output_kwargs(),
+            **self._extra_sched_output_kwargs(),
         )
 
-    def _compat_sched_output_kwargs(self) -> dict[str, Any]:
-        field_names = [field.name for field in fields(SchedulerOutput)]
-        kwargs: dict[str, Any] = {}
-        if "structured_output_request_ids" in field_names:
-            kwargs["structured_output_request_ids"] = {}
-        if "grammar_bitmask" in field_names:
-            kwargs["grammar_bitmask"] = None
-        return kwargs
-
-    def _compat_request_data_kwargs(self) -> dict[str, Any]:
-        field_names = [field.name for field in fields(CachedRequestData)]
-        kwargs: dict[str, Any] = {}
-        if "resumed_req_ids" in field_names:
-            kwargs["resumed_req_ids"] = set()
-        if "all_token_ids" in field_names:
-            kwargs["all_token_ids"] = {}
-        if "num_output_tokens" in field_names:
-            kwargs["num_output_tokens"] = {}
-        if "resumed_from_preemption" in field_names:
-            kwargs["resumed_from_preemption"] = []
-        return kwargs
+    def _extra_sched_output_kwargs(self) -> dict[str, Any]:
+        """Build fresh empty values for the SchedulerOutput kwargs shared by the schedule helpers"""
+        return dict(
+            scheduled_spec_decode_tokens={},
+            scheduled_encoder_inputs={},
+            num_common_prefix_blocks=[],
+            finished_req_ids=set(),
+            free_encoder_mm_hashes=[],
+        )
 
     def assert_block_tables_and_slot_mappings(
         self,
diff --git a/tests/v1/worker/test_prefix_caching_worker.py b/tests/v1/worker/test_prefix_caching_worker.py
index 1db88ebb3..1234b1ea6 100644
--- a/tests/v1/worker/test_prefix_caching_worker.py
+++ b/tests/v1/worker/test_prefix_caching_worker.py
@@ -4,7 +4,6 @@
 from v1.worker.mock_model import InstrumentedModelRunner
 
 from spyre_util import REFERENCE_MODELS
-from vllm_spyre.compat_utils import has_argument
 
 
 @pytest.mark.cpu
@@ -51,11 +50,9 @@ def test_block_sharing_for_2_chunks(
 
     kv_cache_manager = pc_model_runner.kv_cache_manager
 
-    # compat: vLLM 0.15.0 added an argument
-    if has_argument(kv_cache_manager.allocate_new_blocks, "num_tokens_main_model"):
-        kv_cache_manager.allocate_new_blocks(request1.request.request_id, 192, 192)
-    else:
-        kv_cache_manager.allocate_new_blocks(request1.request.request_id, 192)
+    kv_cache_manager.allocate_new_blocks(
+        request1.request.request_id, num_tokens=192, num_tokens_main_model=192
+    )
     kv_cache_manager.cache_blocks(request1.request, 192)
     kv_cache_manager.free(request1.request.request_id)
 
diff --git a/tests/v1/worker/test_spyre_input_batch.py b/tests/v1/worker/test_spyre_input_batch.py
index 9d62e27d3..eb8a142f4 100644
--- a/tests/v1/worker/test_spyre_input_batch.py
+++ b/tests/v1/worker/test_spyre_input_batch.py
@@ -6,11 +6,8 @@
 import torch
 from vllm.sampling_params import SamplingParams
 
-try:
-    from vllm.utils import is_pin_memory_available, make_tensor_with_pad
-except ImportError:
-    from vllm.utils.platform_utils import is_pin_memory_available
-    from vllm.utils.torch_utils import make_tensor_with_pad
+from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.utils.torch_utils import make_tensor_with_pad
 
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
diff --git a/uv.lock b/uv.lock
index 204aa839c..e7b4ba1e4 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2,8 +2,12 @@ version = 1
 revision = 3
 requires-python = ">=3.11"
 resolution-markers = [
-    "python_full_version >= '3.12' and platform_machine != 'aarch64'",
-    "python_full_version >= '3.12' and platform_machine == 'aarch64'",
+    "python_full_version >= '3.14' and platform_machine != 'aarch64'",
+    "python_full_version == '3.13.*' and platform_machine != 'aarch64'",
+    "python_full_version == '3.12.*' and platform_machine != 'aarch64'",
+    "python_full_version >= '3.14' and platform_machine == 'aarch64'",
+    "python_full_version == '3.13.*' and platform_machine == 'aarch64'",
+    "python_full_version == '3.12.*' and platform_machine == 'aarch64'",
     "python_full_version < '3.12' and platform_machine != 'aarch64'",
     "python_full_version < '3.12' and platform_machine == 'aarch64'",
 ]
@@ -21,7 +25,7 @@ overrides = [
     { name = "torchaudio", marker = "sys_platform == 'never'" },
     { name = "torchvision", marker = "sys_platform == 'never'" },
     { name = "triton", marker = "sys_platform == 'never'" },
-    { name = "vllm", marker = "platform_machine not in 's390x, ppc64le'", git = "https://github.com/vllm-project/vllm?rev=v0.16.0" },
+    { name = "vllm", marker = "platform_machine not in 's390x, ppc64le'", git = "https://github.com/vllm-project/vllm?rev=v0.18.0" },
 ]
 
 [[package]]
@@ -1072,6 +1076,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fc/31/6a93a887617ee7deeaa602ca3d02d1c12a6cb8a742a695de5d128f5fa46a/gguf-0.17.1-py3-none-any.whl", hash = "sha256:7bc5aa7eeb1931f7d39b48fdc5b38fda6b294b9dca75cf607ac69557840a3943", size = 96224, upload-time = "2025-06-19T14:00:32.88Z" },
 ]
 
+[[package]]
+name = "googleapis-common-protos"
+version = "1.73.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "protobuf" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/99/96/a0205167fa0154f4a542fd6925bdc63d039d88dab3588b875078107e6f06/googleapis_common_protos-1.73.0.tar.gz", hash = "sha256:778d07cd4fbeff84c6f7c72102f0daf98fa2bfd3fa8bea426edc545588da0b5a", size = 147323, upload-time = "2026-03-06T21:53:09.727Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/69/28/23eea8acd65972bbfe295ce3666b28ac510dfcb115fac089d3edb0feb00a/googleapis_common_protos-1.73.0-py3-none-any.whl", hash = "sha256:dfdaaa2e860f242046be561e6d6cb5c5f1541ae02cfbcb034371aadb2942b4e8", size = 297578, upload-time = "2026-03-06T21:52:33.933Z" },
+]
+
 [[package]]
 name = "grpcio"
 version = "1.76.0"
@@ -1123,19 +1139,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/19/41/0b430b01a2eb38ee887f88c1f07644a1df8e289353b78e82b37ef988fb64/grpcio-1.76.0-cp314-cp314-win_amd64.whl", hash = "sha256:922fa70ba549fce362d2e2871ab542082d66e2aaf0c19480ea453905b01f384e", size = 4834462, upload-time = "2025-10-21T16:22:39.772Z" },
 ]
 
-[[package]]
-name = "grpcio-reflection"
-version = "1.76.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "grpcio" },
-    { name = "protobuf" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/bd/10/767f9c2719c435616141efb3371f6e158f95cdde36a34876ae1d08ba7440/grpcio_reflection-1.76.0.tar.gz", hash = "sha256:e0e7e49921c2ee951e5ddff0bdbacbd1ac1a70888beb61d567f3d01b799decb1", size = 18845, upload-time = "2025-10-21T16:28:57.776Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/32/af/6168cf4ff389deed1388b1196281c67cb36dbbf44aaee40e2bfb72ac0202/grpcio_reflection-1.76.0-py3-none-any.whl", hash = "sha256:d7c43f2047a2a9c9320a5905aa7133c677977436b5f63e6a868e507864a11c73", size = 22702, upload-time = "2025-10-21T16:27:40.846Z" },
-]
-
 [[package]]
 name = "h11"
 version = "0.16.0"
@@ -1366,6 +1369,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ec/f2/53b6e9bdd2a91202066764eaa74b572ba4dede0fe47a5a26f4de34b7541a/ijson-3.4.0.post0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a0fedf09c0f6ffa2a99e7e7fd9c5f3caf74e655c1ee015a0797383e99382ebc3", size = 54657, upload-time = "2025-10-10T05:29:24.482Z" },
 ]
 
+[[package]]
+name = "importlib-metadata"
+version = "8.7.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "zipp" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" },
+]
+
 [[package]]
 name = "iniconfig"
 version = "2.3.0"
@@ -1753,7 +1768,7 @@ wheels = [
 
 [[package]]
 name = "mistral-common"
-version = "1.9.1"
+version = "1.10.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "jsonschema" },
@@ -1765,9 +1780,9 @@ dependencies = [
     { name = "tiktoken" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/db/ce/685b8127a326478e05501cb4c9ca23d1cd9f37e16c465a1e832c75aea709/mistral_common-1.9.1.tar.gz", hash = "sha256:550583d70a395c3586cfb748ffab53bd1d7c3409507f0efc0118bff30ffb26e9", size = 6338922, upload-time = "2026-02-12T10:53:41.639Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/a7/22/f798c1acc3f8cf32b6201b063d96867d79aa39d31dff12478739e1a78979/mistral_common-1.10.0.tar.gz", hash = "sha256:e456ff101edbdfc094039ec6c26f7d0f73356729798d628a6e6e96c3917147bc", size = 6351515, upload-time = "2026-03-13T10:13:46.683Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ac/72/a38bb1fd9fd4d4ef990341c9dd1a7c8061f1951e10efa6d50c0a3f04eced/mistral_common-1.9.1-py3-none-any.whl", hash = "sha256:9e2b2520b6f67bac2e2bb06fcf985b7a1277b01938da2b7cda8cf0fdbfa92e91", size = 6518623, upload-time = "2026-02-12T10:53:39.457Z" },
+    { url = "https://files.pythonhosted.org/packages/87/c6/1429a0a3ab40f8530492b62b52eb792266c261b22ed62aa7f25d61d531ae/mistral_common-1.10.0-py3-none-any.whl", hash = "sha256:c594d1a05202b61e8f0d867ec6064df4c5e5d492c2c2bdb6fd8fb4872c6afd8b", size = 6525284, upload-time = "2026-03-13T10:13:44.329Z" },
 ]
 
 [package.optional-dependencies]
@@ -1775,56 +1790,6 @@ image = [
     { name = "opencv-python-headless" },
 ]
 
-[[package]]
-name = "mlx"
-version = "0.30.3"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "mlx-metal", marker = "platform_machine != 'aarch64' and sys_platform == 'darwin'" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/78/b6/dfcfffc41d832a86249715fab336dc8638c2237035287eb24af792484c53/mlx-0.30.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:794e79587a4906bdb3c5473ef936f45008eaaa609a3c498cc29a442b2c829621", size = 568664, upload-time = "2026-01-14T01:16:45.573Z" },
-    { url = "https://files.pythonhosted.org/packages/22/9f/22d494b83b611380063da31c2b482db8c620f7ad6531cfcd1e11f7c35852/mlx-0.30.3-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:472cdc6eaca8610224621a1561e8c36477eab1a2f0dd3eb49b95484d739c4605", size = 568663, upload-time = "2026-01-14T01:16:46.588Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/76/b6fb0500aef8e9ed65d4730d8c34b13d7a770ca863b9af363b5713a16040/mlx-0.30.3-cp311-cp311-macosx_26_0_arm64.whl", hash = "sha256:a5d82be69c7e671dc4d5855d2f6aedcb507817e5985478903ab754b642d9ba01", size = 568522, upload-time = "2026-01-14T05:52:08.334Z" },
-    { url = "https://files.pythonhosted.org/packages/11/b3/e24c3a69dad0cf4404bb174c6fed0d804022da64758cd815a254e1cd0627/mlx-0.30.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:0b275168b80645a155b456e1a457a37fb5ee2c251e8fbd8db9e153351a9e2d2f", size = 569398, upload-time = "2026-01-14T01:16:49.804Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/87/d0804443da97a06d3439f6efb0ceffa178f530a121f0f4a6c77b39f8bfd7/mlx-0.30.3-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:6e818de14864982e832344198240a1dafba7d3316c4eb6f1b8e43b4dd25dd2ef", size = 569396, upload-time = "2026-01-14T01:16:51.007Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/dc/7cdd95e4561b73fba8c86bf11293797076120400e472fe2a72ef483b6d8d/mlx-0.30.3-cp312-cp312-macosx_26_0_arm64.whl", hash = "sha256:d23b422209fd4b7ecacef59070321f8c6a122f906a5e9b6683a5fc9e1b8fcd5c", size = 569192, upload-time = "2026-01-14T05:52:09.715Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/22/42935d593fe82d3b98eb9d60e4620ed99703886635106f89d407c68f33bc/mlx-0.30.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:743fac1e4f9e8e46c8262943c643a31139c255cdb256c99ad496958215ccac1e", size = 569344, upload-time = "2026-01-14T01:16:54.847Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/27/f2e7a5236289d45315d0215e8553b4dd7e2faaba3bcb5025b34b25d5ab66/mlx-0.30.3-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:3b04ae81655aa0e63a6e8f2c749de3bbce64cf5b168ae10f39ed086dfa99e7f8", size = 569345, upload-time = "2026-01-14T01:16:56.564Z" },
-    { url = "https://files.pythonhosted.org/packages/01/41/06b042457f51952456e9bb46b2c6e205ab3a28fc52d6751b5787fdb762b2/mlx-0.30.3-cp313-cp313-macosx_26_0_arm64.whl", hash = "sha256:ba9b5bdb1e929cc130af72efd7f73508c0f4e526d224489af7ec1c6419564659", size = 569213, upload-time = "2026-01-14T05:52:10.86Z" },
-    { url = "https://files.pythonhosted.org/packages/82/e2/6e551bd48fb350fbf0ee4cc5cd09485437d260b8f4937f22d8623e14687a/mlx-0.30.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:2c27fd8daaae14ca6cf407fcd236006a6e968f7708c8f61a2709116f2e754852", size = 571920, upload-time = "2026-01-14T01:16:59.683Z" },
-    { url = "https://files.pythonhosted.org/packages/82/c0/561d1c9d3d12830b0e7fdcbd807585ef20909e398d4bcdbf25e4367543eb/mlx-0.30.3-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:b755fd4ed4b6a2ae4dee3766b5a2ea52fcbe83ebd1cf018458e18b74139409f3", size = 571921, upload-time = "2026-01-14T01:17:00.868Z" },
-    { url = "https://files.pythonhosted.org/packages/42/1a/fb573fc2edc22a777fa254ff5c0c886ffd2c88aeb1f21c45778ef170f990/mlx-0.30.3-cp314-cp314-macosx_26_0_arm64.whl", hash = "sha256:7e352c0369a2f7e54d4f317b434eab3333918ea9edde1c43c61d36386b6f76bf", size = 571732, upload-time = "2026-01-14T05:52:11.893Z" },
-]
-
-[[package]]
-name = "mlx-lm"
-version = "0.29.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "jinja2", marker = "platform_machine != 'aarch64'" },
-    { name = "mlx", marker = "platform_machine != 'aarch64' and sys_platform == 'darwin'" },
-    { name = "numpy", marker = "platform_machine != 'aarch64'" },
-    { name = "protobuf", marker = "platform_machine != 'aarch64'" },
-    { name = "pyyaml", marker = "platform_machine != 'aarch64'" },
-    { name = "sentencepiece", marker = "platform_machine != 'aarch64'" },
-    { name = "transformers", marker = "platform_machine != 'aarch64'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/e3/62/f46e1355256a114808517947f8e83ad6be310c7288c551db0fa678f47923/mlx_lm-0.29.1.tar.gz", hash = "sha256:b99180d8f33d33a077b814e550bfb2d8a59ae003d668fd1f4b3fff62a381d34b", size = 232302, upload-time = "2025-12-16T16:58:27.959Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/e5/53/913099c91d384e115ea078325efd9a0bc1ea3eb3458c694b4596cbd267f2/mlx_lm-0.29.1-py3-none-any.whl", hash = "sha256:440941b3054c2a2216e97615de584cc90fa1ea874782e20699b9895721fad8dc", size = 324884, upload-time = "2025-12-16T16:58:26.36Z" },
-]
-
-[[package]]
-name = "mlx-metal"
-version = "0.30.3"
-source = { registry = "https://pypi.org/simple" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/f6/63/4d8f6fefb507c028df4454dabfe8d8e0ad2961bb06510b6aca23d2d5b2be/mlx_metal-0.30.3-py3-none-macosx_14_0_arm64.whl", hash = "sha256:6276312b02353714c7c6515169569fe1c4bebe3229c8ecf1fdb375a13e78c966", size = 37716245, upload-time = "2026-01-14T01:16:34.838Z" },
-    { url = "https://files.pythonhosted.org/packages/35/91/1d452e48a4bb4958844fd3bb28ae31b8de110549c009ebec5024ce27ebf3/mlx_metal-0.30.3-py3-none-macosx_15_0_arm64.whl", hash = "sha256:c096c0a3428f3f96a06220f97a36f9528b18bc05173f821eb05bc8458e723fa8", size = 37712125, upload-time = "2026-01-14T01:16:38.619Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/36/7a3cbca85542b5ca4faf871e35927f43aa0e3fc830ae5b699780fe723677/mlx_metal-0.30.3-py3-none-macosx_26_0_arm64.whl", hash = "sha256:69068533bd1ee8b0379ce5de57ed5fd313577a10ecab58e1332fd1ff7248a75e", size = 46488962, upload-time = "2026-01-14T05:52:04.523Z" },
-]
-
 [[package]]
 name = "model-hosting-container-standards"
 version = "0.1.13"
@@ -2215,6 +2180,132 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f2/35/0858e9e71b36948eafbc5e835874b63e515179dc3b742cbe3d76bc683439/opencv_python_headless-4.12.0.88-cp37-abi3-win_amd64.whl", hash = "sha256:86b413bdd6c6bf497832e346cd5371995de148e579b9774f8eba686dee3f5528", size = 38923559, upload-time = "2025-07-07T09:15:25.229Z" },
 ]
 
+[[package]]
+name = "opentelemetry-api"
+version = "1.40.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "importlib-metadata" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/2c/1d/4049a9e8698361cc1a1aa03a6c59e4fa4c71e0c0f94a30f988a6876a2ae6/opentelemetry_api-1.40.0.tar.gz", hash = "sha256:159be641c0b04d11e9ecd576906462773eb97ae1b657730f0ecf64d32071569f", size = 70851, upload-time = "2026-03-04T14:17:21.555Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5f/bf/93795954016c522008da367da292adceed71cca6ee1717e1d64c83089099/opentelemetry_api-1.40.0-py3-none-any.whl", hash = "sha256:82dd69331ae74b06f6a874704be0cfaa49a1650e1537d4a813b86ecef7d0ecf9", size = 68676, upload-time = "2026-03-04T14:17:01.24Z" },
+]
+
+[[package]]
+name = "opentelemetry-exporter-otlp"
+version = "1.40.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-exporter-otlp-proto-grpc" },
+    { name = "opentelemetry-exporter-otlp-proto-http" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d0/37/b6708e0eff5c5fb9aba2e0ea09f7f3bcbfd12a592d2a780241b5f6014df7/opentelemetry_exporter_otlp-1.40.0.tar.gz", hash = "sha256:7caa0870b95e2fcb59d64e16e2b639ecffb07771b6cd0000b5d12e5e4fef765a", size = 6152, upload-time = "2026-03-04T14:17:23.235Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2d/fc/aea77c28d9f3ffef2fdafdc3f4a235aee4091d262ddabd25882f47ce5c5f/opentelemetry_exporter_otlp-1.40.0-py3-none-any.whl", hash = "sha256:48c87e539ec9afb30dc443775a1334cc5487de2f72a770a4c00b1610bf6c697d", size = 7023, upload-time = "2026-03-04T14:17:03.612Z" },
+]
+
+[[package]]
+name = "opentelemetry-exporter-otlp-proto-common"
+version = "1.40.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-proto" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/51/bc/1559d46557fe6eca0b46c88d4c2676285f1f3be2e8d06bb5d15fbffc814a/opentelemetry_exporter_otlp_proto_common-1.40.0.tar.gz", hash = "sha256:1cbee86a4064790b362a86601ee7934f368b81cd4cc2f2e163902a6e7818a0fa", size = 20416, upload-time = "2026-03-04T14:17:23.801Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8b/ca/8f122055c97a932311a3f640273f084e738008933503d0c2563cd5d591fc/opentelemetry_exporter_otlp_proto_common-1.40.0-py3-none-any.whl", hash = "sha256:7081ff453835a82417bf38dccf122c827c3cbc94f2079b03bba02a3165f25149", size = 18369, upload-time = "2026-03-04T14:17:04.796Z" },
+]
+
+[[package]]
+name = "opentelemetry-exporter-otlp-proto-grpc"
+version = "1.40.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "googleapis-common-protos" },
+    { name = "grpcio" },
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-exporter-otlp-proto-common" },
+    { name = "opentelemetry-proto" },
+    { name = "opentelemetry-sdk" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/8f/7f/b9e60435cfcc7590fa87436edad6822240dddbc184643a2a005301cc31f4/opentelemetry_exporter_otlp_proto_grpc-1.40.0.tar.gz", hash = "sha256:bd4015183e40b635b3dab8da528b27161ba83bf4ef545776b196f0fb4ec47740", size = 25759, upload-time = "2026-03-04T14:17:24.4Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/96/6f/7ee0980afcbdcd2d40362da16f7f9796bd083bf7f0b8e038abfbc0300f5d/opentelemetry_exporter_otlp_proto_grpc-1.40.0-py3-none-any.whl", hash = "sha256:2aa0ca53483fe0cf6405087a7491472b70335bc5c7944378a0a8e72e86995c52", size = 20304, upload-time = "2026-03-04T14:17:05.942Z" },
+]
+
+[[package]]
+name = "opentelemetry-exporter-otlp-proto-http"
+version = "1.40.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "googleapis-common-protos" },
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-exporter-otlp-proto-common" },
+    { name = "opentelemetry-proto" },
+    { name = "opentelemetry-sdk" },
+    { name = "requests" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/2e/fa/73d50e2c15c56be4d000c98e24221d494674b0cc95524e2a8cb3856d95a4/opentelemetry_exporter_otlp_proto_http-1.40.0.tar.gz", hash = "sha256:db48f5e0f33217588bbc00274a31517ba830da576e59503507c839b38fa0869c", size = 17772, upload-time = "2026-03-04T14:17:25.324Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a0/3a/8865d6754e61c9fb170cdd530a124a53769ee5f740236064816eb0ca7301/opentelemetry_exporter_otlp_proto_http-1.40.0-py3-none-any.whl", hash = "sha256:a8d1dab28f504c5d96577d6509f80a8150e44e8f45f82cdbe0e34c99ab040069", size = 19960, upload-time = "2026-03-04T14:17:07.153Z" },
+]
+
+[[package]]
+name = "opentelemetry-proto"
+version = "1.40.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "protobuf" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/4c/77/dd38991db037fdfce45849491cb61de5ab000f49824a00230afb112a4392/opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd", size = 45667, upload-time = "2026-03-04T14:17:31.194Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b9/b2/189b2577dde745b15625b3214302605b1353436219d42b7912e77fa8dc24/opentelemetry_proto-1.40.0-py3-none-any.whl", hash = "sha256:266c4385d88923a23d63e353e9761af0f47a6ed0d486979777fe4de59dc9b25f", size = 72073, upload-time = "2026-03-04T14:17:16.673Z" },
+]
+
+[[package]]
+name = "opentelemetry-sdk"
+version = "1.40.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/58/fd/3c3125b20ba18ce2155ba9ea74acb0ae5d25f8cd39cfd37455601b7955cc/opentelemetry_sdk-1.40.0.tar.gz", hash = "sha256:18e9f5ec20d859d268c7cb3c5198c8d105d073714db3de50b593b8c1345a48f2", size = 184252, upload-time = "2026-03-04T14:17:31.87Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2c/c5/6a852903d8bfac758c6dc6e9a68b015d3c33f2f1be5e9591e0f4b69c7e0a/opentelemetry_sdk-1.40.0-py3-none-any.whl", hash = "sha256:787d2154a71f4b3d81f20524a8ce061b7db667d24e46753f32a7bc48f1c1f3f1", size = 141951, upload-time = "2026-03-04T14:17:17.961Z" },
+]
+
+[[package]]
+name = "opentelemetry-semantic-conventions"
+version = "0.61b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/6d/c0/4ae7973f3c2cfd2b6e321f1675626f0dab0a97027cc7a297474c9c8f3d04/opentelemetry_semantic_conventions-0.61b0.tar.gz", hash = "sha256:072f65473c5d7c6dc0355b27d6c9d1a679d63b6d4b4b16a9773062cb7e31192a", size = 145755, upload-time = "2026-03-04T14:17:32.664Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b2/37/cc6a55e448deaa9b27377d087da8615a3416d8ad523d5960b78dbeadd02a/opentelemetry_semantic_conventions-0.61b0-py3-none-any.whl", hash = "sha256:fa530a96be229795f8cef353739b618148b0fe2b4b3f005e60e262926c4d38e2", size = 231621, upload-time = "2026-03-04T14:17:19.33Z" },
+]
+
+[[package]]
+name = "opentelemetry-semantic-conventions-ai"
+version = "0.5.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-sdk" },
+    { name = "opentelemetry-semantic-conventions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c6/0b/0ff2326417a9eed74ff6717629075246098dcbda067a62fd73095139babb/opentelemetry_semantic_conventions_ai-0.5.0.tar.gz", hash = "sha256:64c21c5ae0c971ee2ecab986d66e93bb50e616b52e18a1284e118a323a9e6869", size = 25202, upload-time = "2026-03-20T08:47:05.751Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a7/18/35fec29ed6e49bcbbe629b790cc0deb5bb58da9caceee29b39b54d3d7f47/opentelemetry_semantic_conventions_ai-0.5.0-py3-none-any.whl", hash = "sha256:8727f474f590138f5e4937945378878a5b2f4ea82bc24ffd93265ca9fbdc48a4", size = 9983, upload-time = "2026-03-20T08:47:06.843Z" },
+]
+
 [[package]]
 name = "outlines-core"
 version = "0.2.11"
@@ -4273,8 +4364,8 @@ wheels = [
 
 [[package]]
 name = "vllm"
-version = "0.16.0"
-source = { git = "https://github.com/vllm-project/vllm?rev=v0.16.0#89a77b10846fd96273cce78d86d2556ea582d26e" }
+version = "0.18.0+cpu"
+source = { git = "https://github.com/vllm-project/vllm?rev=v0.18.0#bcf2be96120005e9aea171927f85055a6a5c0cf6" }
 dependencies = [
     { name = "aiohttp" },
     { name = "anthropic" },
@@ -4289,8 +4380,6 @@ dependencies = [
     { name = "fastapi", extra = ["standard"] },
     { name = "filelock" },
     { name = "gguf" },
-    { name = "grpcio" },
-    { name = "grpcio-reflection" },
     { name = "ijson" },
     { name = "intel-openmp", marker = "platform_machine == 'x86_64'" },
     { name = "lark" },
@@ -4306,6 +4395,10 @@ dependencies = [
     { name = "openai" },
     { name = "openai-harmony" },
     { name = "opencv-python-headless" },
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-exporter-otlp" },
+    { name = "opentelemetry-sdk" },
+    { name = "opentelemetry-semantic-conventions-ai" },
     { name = "outlines-core" },
     { name = "partial-json-parser" },
     { name = "pillow" },
@@ -4362,7 +4455,7 @@ dev = [
 requires-dist = [
     { name = "fms-model-optimizer", extras = ["fp8"], specifier = ">=0.8.0" },
     { name = "ibm-fms", specifier = ">=1.7.0,<2.0" },
-    { name = "vllm", git = "https://github.com/vllm-project/vllm?rev=v0.16.0" },
+    { name = "vllm", git = "https://github.com/vllm-project/vllm?rev=v0.18.0" },
 ]
 
 [package.metadata.requires-dev]
@@ -4534,10 +4627,9 @@ wheels = [
 
 [[package]]
 name = "xgrammar"
-version = "0.1.29"
+version = "0.1.32"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "mlx-lm", marker = "platform_machine == 'arm64' and sys_platform == 'darwin'" },
     { name = "numpy" },
     { name = "pydantic" },
     { name = "torch", marker = "sys_platform == 'never'" },
@@ -4545,21 +4637,32 @@ dependencies = [
     { name = "triton", marker = "sys_platform == 'never'" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/02/a3/70dbe3ffd331a1e7e1ad5a95690a4086e6c7cdb8089f5c7eda712219ccec/xgrammar-0.1.29.tar.gz", hash = "sha256:cf195afa81b489eebf35d4c6f37f27136d05420739ab4a6f7f065c938d7e4baa", size = 2321317, upload-time = "2025-12-19T08:23:54.53Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c6/de/88832fac40962fd0d4703bd4ba84598b06b8408bdc4a6722744f363f68a6/xgrammar-0.1.29-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:d2a7eef1b75b8d31b868d5c79855622aad203275ff267fc0e0ef77dd91906cfe", size = 16008004, upload-time = "2025-12-19T08:23:11.998Z" },
-    { url = "https://files.pythonhosted.org/packages/76/f6/4d22eec5305657430955442077306bc6ed85becc564116165d4b3a7049ad/xgrammar-0.1.29-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4af7f6ce2b2c6295b936b7cbda09f78e33f2c492a139cd64560f5d8d0fe967ed", size = 17914326, upload-time = "2025-12-19T08:23:14.43Z" },
-    { url = "https://files.pythonhosted.org/packages/87/0b/b5e5c99ce13a9d378a940cda07c5a08b50cc7efb66936c6ac8fa8232a0d5/xgrammar-0.1.29-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51bcfd63bd48a0b26209ffd2143a42067518559355ec9e4e574cef2ae74fac7c", size = 34699408, upload-time = "2025-12-19T08:23:16.906Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/a0/4ebc1b3f5af79a3f73d0566034758f3fbcd9c64174646314a9a6f7cc1d27/xgrammar-0.1.29-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e27b50cf8c565845295a8263a4a0790c00a7c1fd783e76222fc0f575654d6f56", size = 34903461, upload-time = "2025-12-19T08:23:19.556Z" },
-    { url = "https://files.pythonhosted.org/packages/77/21/f6b3978dc9761bbfbbb153d33441206ce2253efa271d8e2d8b6b210d2bd7/xgrammar-0.1.29-cp311-cp311-win_amd64.whl", hash = "sha256:c9f8ea76bcf41b48168974b509b1546d2bee289ff1b20c68bc97434c1ea6e49a", size = 5928633, upload-time = "2025-12-19T08:23:21.67Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/d8/fb282fc78be6e9bbefb5cb389f66b22e4efd6ae14f06234f599651620da5/xgrammar-0.1.29-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:d992a3cee7594bbdaa64ae59f90da5ce21c5fe654719df3816014289ada6f04d", size = 16007376, upload-time = "2025-12-19T08:23:23.634Z" },
-    { url = "https://files.pythonhosted.org/packages/82/a7/2c9767620ee50f2f40f1eb95e55a3a29e1a0670f087ee6dc1bc1c887b906/xgrammar-0.1.29-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1bbdf02e45cfa8614218ba01ca7952d375f8bc1c13884e3d04daa4b54180cbc2", size = 17913535, upload-time = "2025-12-19T08:23:26.02Z" },
-    { url = "https://files.pythonhosted.org/packages/57/94/18793c64bf0368075a34c06e196bf002f1e6ab0aee332268f44e8d356d5a/xgrammar-0.1.29-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6eb370a16b27a683e5f2b9e429ab41440c69977d4a504849ed61831b94cc704c", size = 34705239, upload-time = "2025-12-19T08:23:28.369Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/da/4c14e3e00be698009b52700f15326a23272b4b00475939b6acc86b151188/xgrammar-0.1.29-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:79e6e4f5cd33be77418cf91efc482f2b3d773d309891224383bc8a4948ad7b07", size = 34906135, upload-time = "2025-12-19T08:23:30.838Z" },
-    { url = "https://files.pythonhosted.org/packages/22/d8/34423997f48627cef3b74cc894d9dfcaacae02941c06237ac5f3196406a7/xgrammar-0.1.29-cp312-cp312-win_amd64.whl", hash = "sha256:39bdfadedbce34599835486164fa80ba00248c6c75ad91f3843db90ef37e037f", size = 5928381, upload-time = "2025-12-19T08:23:33.428Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/ef/8a4b4cb10fc996c0a25c9bf5613aaf5a86114291a9a4003e43605cab42bf/xgrammar-0.1.29-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fedf21e447ef646f23a6e2d11877c0812d55965dcf8c0aa9b0f32590c9b6e22a", size = 17913609, upload-time = "2025-12-19T08:23:36.06Z" },
-    { url = "https://files.pythonhosted.org/packages/e9/c5/e4965c9921e7bb6061f246ae7f8c7b9b1dfc21262248100c2f9b398b361e/xgrammar-0.1.29-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb22aea775971f7d8c4d0e193257ebeb71b68acd9d36af3331ca5fd4d9a46991", size = 34904126, upload-time = "2025-12-19T08:23:38.335Z" },
-    { url = "https://files.pythonhosted.org/packages/09/26/641d7ee1a59e526aa94be980c485f899088d09dd1af517a2e1d0e85853bc/xgrammar-0.1.29-cp313-cp313-win_amd64.whl", hash = "sha256:12e6d63e892e9da8d088569dd629af58a5eafd909dc58788d499c4fd74bcd2a1", size = 5928450, upload-time = "2025-12-19T08:23:40.667Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/99/6a/d51b44fc0b43e2d4adae42b6a17fe9ee49e177d6d768be739ed7dec7b57e/xgrammar-0.1.32.tar.gz", hash = "sha256:5d424d52779ca2d3ccaf72f2289d6519efe308e933d0d3fc3c292c780825bb12", size = 2365047, upload-time = "2026-03-04T12:01:52.544Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/28/cd/4b5e67c8030b626a1a00b65b4d149b1b031c885eef86d4e5fa296f6ec72e/xgrammar-0.1.32-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:51b41c47785aa198d19f8d056b394f75b4421deab88c415568f9c588b1f7e238", size = 18425822, upload-time = "2026-03-04T12:00:23.356Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/c0/94fbc45642e733a9ad4a9f3f7300a1a06b265f8657af4d6a56acd8cf00c4/xgrammar-0.1.32-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d7030192cb1d8579699f1f72fd14d31347a402611aab98a2da6a04c3de07e917", size = 20582669, upload-time = "2026-03-04T12:00:26.463Z" },
+    { url = "https://files.pythonhosted.org/packages/90/ea/2f4c8616d8ed0b5a3eb4e417b4987ad5a8d9dd9336ed966a8d48ffd45907/xgrammar-0.1.32-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a332c0364f665b410a6cfc2ada155c3a6ede430e385ac431015e31735a64fec3", size = 37682948, upload-time = "2026-03-04T12:00:29.814Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/ae/b9108fadd354ae776c1e7ecd26890a13ac8a30367f9fe8110443aedc4e6a/xgrammar-0.1.32-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5b8ad132d0fcf3a51dc054ecb0dc9808566b302122de6edaac7b4aca460adbec", size = 37709617, upload-time = "2026-03-04T12:00:33.068Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/48/0096bd1f3b460eac48faaecf79418ea3172269dccf37968e78dff5114faf/xgrammar-0.1.32-cp311-cp311-win_amd64.whl", hash = "sha256:b8b1ca6d3f3c2842660458660e494aaf0a6745f1b07ae74e4c2230ab4ff70c11", size = 6632722, upload-time = "2026-03-04T12:00:36.133Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/fd/5e771276fa090e35eaf1cbfdede24b9d93d6bbd2e99cd4f8d558f381fdee/xgrammar-0.1.32-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:9b78d32265f096e5567ab52c72b681855cf473481a48a1e7e6d97d414ba30b82", size = 18425090, upload-time = "2026-03-04T12:00:38.5Z" },
+    { url = "https://files.pythonhosted.org/packages/31/66/f06745755ef0750f43955cf679b4bd8bd88ac8bfab760f020225c192884f/xgrammar-0.1.32-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23eacaf826c3aeebca0d91fc271417d9d96e157af2bacf6f14277297af7917ef", size = 20582048, upload-time = "2026-03-04T12:00:42.369Z" },
+    { url = "https://files.pythonhosted.org/packages/79/29/3b0306800ccabce8f565123a5b97432dee43822c30142085d9b13b43f166/xgrammar-0.1.32-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9a637d4e0c541149e0d409c24f4ec79cd74d87508ee6a17a7e64a9b9c0cf56f", size = 37680849, upload-time = "2026-03-04T12:00:46.712Z" },
+    { url = "https://files.pythonhosted.org/packages/69/62/65e664d861cdadf2d788c03dd8fe67f1faaa7bd4bd2317a2ab850aebee20/xgrammar-0.1.32-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f96c7a4fcbd68e18b13cb3b6ed5d24b5326b256933f476bdaf2cc8e609c228db", size = 37711100, upload-time = "2026-03-04T12:00:50.188Z" },
+    { url = "https://files.pythonhosted.org/packages/80/43/05f27a1739209eb590772f867f3f48e6db0a36f376d85db4e68f49aee799/xgrammar-0.1.32-cp312-cp312-win_amd64.whl", hash = "sha256:ba6e08c385cce53eda8e9b3bbfba63f100ba3dcb76fa0692a65921a36b20ad0a", size = 6632259, upload-time = "2026-03-04T12:00:53.184Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/58/b4ff220b28d7d6a4ccf5c229ddbabc7018cd9544356ac8a161086e7a7a0e/xgrammar-0.1.32-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4addb8f5d5699e7df7fca6d299a91b3ef1ad799811c0ab7050d6f96d754c9c21", size = 20582005, upload-time = "2026-03-04T12:00:55.089Z" },
+    { url = "https://files.pythonhosted.org/packages/83/95/9fedafd412af05b1d61859c52fd9d26abc9a167fab66bdad53f832da0956/xgrammar-0.1.32-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:028f8d6a105d06549faee0afbebfaada90aa1941c081dcc88f3d5ef373dad934", size = 37680882, upload-time = "2026-03-04T12:00:59.456Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/21/a9d328ae9ff4e794281995de3a1f8065517bb9bef70f099ab24f7743b3be/xgrammar-0.1.32-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0c0150c50eb3a56a35d6f0c0af0bce0f113ec5f84f7918bfd46b49e25ecf7fb5", size = 37710862, upload-time = "2026-03-04T12:01:02.739Z" },
+    { url = "https://files.pythonhosted.org/packages/28/dc/8ecf71ad1e9c96fd941d2e9a852e184054596eeb1799de8b2e172eaf705e/xgrammar-0.1.32-cp313-cp313-win_amd64.whl", hash = "sha256:e1072d764705c8e87df6136ce3419f96ab3fd423d85f58c2d81c13a647b78894", size = 6632312, upload-time = "2026-03-04T12:01:05.474Z" },
+    { url = "https://files.pythonhosted.org/packages/39/5d/79d524f302ab257f0b6856946e387783f688035360f0c8873b457700e391/xgrammar-0.1.32-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:4e6015ad2b941a292562f68b9a2ee1ddae8e28df840dc39232dcc7007fc6f606", size = 18432652, upload-time = "2026-03-04T12:01:07.366Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/4d/94bdf71b03f94b16265e956d9277fc182384561409b25ede79614fe1fa32/xgrammar-0.1.32-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8e8da3e7fc194e098b760bacb2b60ad2227cac70d7be5d2e4f7025b1c360c43d", size = 20582170, upload-time = "2026-03-04T12:01:10.012Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/80/30f9dcea0574c46a20cdecf91ab35f882fa4e7ba028ce5ebfeb3afe1d5bb/xgrammar-0.1.32-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6588cfd9754f2c46846276a2e8284a46582a74886d7aaea02cf6ce63ccc397ce", size = 37680819, upload-time = "2026-03-04T12:01:12.958Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/bc/4ff87fbf59a4abd272325d3489ac5aa599bacd8b01ea09fec2ca84eece14/xgrammar-0.1.32-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7f740ba83b69abb423167a5d5b13a9fcde89747220e191f6a004fae4a834311f", size = 37711054, upload-time = "2026-03-04T12:01:17.469Z" },
+    { url = "https://files.pythonhosted.org/packages/62/fa/16b91df8a50798980b60b2c4c800280a3bed50d6a18e55ef6958d30d0faa/xgrammar-0.1.32-cp314-cp314-win_amd64.whl", hash = "sha256:9c0769c3468bd67495c28a03dc5ce3948d83cddaf0a59c6d992b12fc683a1c3e", size = 6718108, upload-time = "2026-03-04T12:01:20.222Z" },
+    { url = "https://files.pythonhosted.org/packages/48/7d/78373114c3ceb5e82cb98bbbde20191477ff5b219f941aa7a535c94bcab8/xgrammar-0.1.32-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:da8339b38e96d105868c14b2cb2df4b7c83d7a49f8539c74fd7470d61043e5b1", size = 18435039, upload-time = "2026-03-04T12:01:22.458Z" },
+    { url = "https://files.pythonhosted.org/packages/61/64/676553d63f74b65887e3ebad86468f557fe0a0ff6373186d300272c7776c/xgrammar-0.1.32-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b938a9096bccc06c30abb5304b2b39c272a924ca002e19421cce5e6ee9670f4f", size = 20584105, upload-time = "2026-03-04T12:01:26.08Z" },
+    { url = "https://files.pythonhosted.org/packages/67/dd/fa6ce458f7b9ab694458683064de08c07509d17c148241000b3d97291383/xgrammar-0.1.32-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fe2ee94080d77b84e38cb6643b75a6ca29cf814a3e5d5da8e1176eae4034d662", size = 37683911, upload-time = "2026-03-04T12:01:29.661Z" },
+    { url = "https://files.pythonhosted.org/packages/80/ba/98675e76c481832a6cbe51aba2b1bf4a9593b5352f9a60c07c5d209e184a/xgrammar-0.1.32-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:70ddbf7216e1e7ec96134a2474a6b84d2b14439a6f6379e079b7c557131be41d", size = 37706596, upload-time = "2026-03-04T12:01:33.264Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/b8/aeafad38d44af75e31101752bcd8fa2a9f4f6b702861813bc7edcfbca266/xgrammar-0.1.32-cp314-cp314t-win_amd64.whl", hash = "sha256:4f68e591a6e9e121d5f03821ab2c44a7af092dc8bf7c9cde1a776871c6bd4dc5", size = 6723286, upload-time = "2026-03-04T12:01:35.866Z" },
 ]
 
 [[package]]
@@ -4774,3 +4877,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/48/b7/503c98092fb3b344a179579f55814b613c1fbb1c23b3ec14a7b008a66a6e/yarl-1.22.0-cp314-cp314t-win_arm64.whl", hash = "sha256:9f6d73c1436b934e3f01df1e1b21ff765cd1d28c77dfb9ace207f746d4610ee1", size = 85171, upload-time = "2025-10-06T14:12:16.935Z" },
     { url = "https://files.pythonhosted.org/packages/73/ae/b48f95715333080afb75a4504487cbe142cae1268afc482d06692d605ae6/yarl-1.22.0-py3-none-any.whl", hash = "sha256:1380560bdba02b6b6c90de54133c81c9f2a453dee9912fe58c1dcced1edb7cff", size = 46814, upload-time = "2025-10-06T14:12:53.872Z" },
 ]
+
+[[package]]
+name = "zipp"
+version = "3.23.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" },
+]
diff --git a/vllm_spyre/platform.py b/vllm_spyre/platform.py
index fa6e1ae67..46b0e22ec 100644
--- a/vllm_spyre/platform.py
+++ b/vllm_spyre/platform.py
@@ -17,12 +17,8 @@
 
 import torch
 from vllm.logger import init_logger
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
-try:
-    # pre 0.11.1 compatibility
-    from vllm.utils import FlexibleArgumentParser  # ty: ignore[unresolved-import]
-except ImportError:
-    from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 if TYPE_CHECKING:
     # NB: We can't eagerly import many things from vllm since vllm.config
@@ -30,24 +26,15 @@
     from vllm.config import ModelConfig, VllmConfig
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
-    from vllm.inputs import ProcessorInputs, PromptType, TokenInputs
-
-    # Try to import new types (0.16.0+)
-    try:
-        from vllm.renderers.inputs import DictPrompt, TokPrompt
-    except ImportError:
-        DictPrompt = None  # type: ignore
-        TokPrompt = None  # type: ignore
+    from vllm.inputs import ProcessorInputs, TokenInputs
+
 else:
     ModelConfig = None
     VllmConfig = None
     SamplingParams = None
     PoolingParams = None
     ProcessorInputs = None
-    PromptType = None
     TokenInputs = None
-    DictPrompt = None
-    TokPrompt = None
 from vllm.platforms import Platform, PlatformEnum
 
 import vllm_spyre.envs as envs_spyre
@@ -103,7 +90,14 @@ def get_device_name(cls, device_id: int = 0) -> str:
 
     @classmethod
     def import_kernels(cls) -> None:
-        pass  # suppress warning
+        """Install a no-op ``torch.accelerator.empty_cache`` when torch lacks one.
+
+        Workaround for torch 2.7.1 and vllm v0.18.0 compatibility. The stub is
+        only installed if the attribute is missing, so a torch build that ships
+        a real implementation is never overridden.
+        """
+        if not hasattr(torch.accelerator, "empty_cache"):
+            setattr(torch.accelerator, "empty_cache", lambda: None)  # noqa
 
     @classmethod
     def is_async_output_supported(cls, enforce_eager: bool | None) -> bool:
@@ -319,6 +307,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                     "set `--max-num-batched-tokens` to a number that satisfies "
                     "this constraint."
                 )
+            if hasattr(cache_config, "user_specified_block_size"):
+                cache_config.user_specified_block_size = True
 
         logger.info(
             "Configurations for Spyre. max_model_len=%d, max_num_seqs=%d, block_size=%d, "
@@ -436,9 +426,8 @@ def supports_v1(cls, model_config: ModelConfig) -> bool:
     @classmethod
     def validate_request(
         cls,
-        prompt: "PromptType | DictPrompt | TokPrompt",
+        processed_inputs: "ProcessorInputs",
         params: "SamplingParams | PoolingParams",
-        processed_inputs: "ProcessorInputs | None" = None,
     ) -> None:
         """Raises if this request is unsupported on this platform"""
 
@@ -464,18 +453,12 @@ def validate_request(
             )
             params.structured_outputs = None
 
-        if isinstance(prompt, dict) and "prompt_token_ids" in prompt:
-            prompt_len = len(prompt["prompt_token_ids"])  # ty: ignore
-        elif processed_inputs is not None:
-            if "encoder" in processed_inputs:
-                raise ValueError("Encoder-decoder models not supported ")
-            if "prompt_token_ids" not in processed_inputs:
-                # Can't do any extra validation on embedding-only inputs
-                return
-            prompt_len = len(cast(TokenInputs, processed_inputs)["prompt_token_ids"])
-        else:
-            # We need a prompt length to do any validation here
+        if "encoder_prompt" in processed_inputs:
+            raise ValueError("Encoder-decoder models not supported ")
+        if "prompt_token_ids" not in processed_inputs:
+            # Can't do any extra validation on embedding-only inputs
             return
+        prompt_len = len(cast(TokenInputs, processed_inputs)["prompt_token_ids"])
 
         max_tokens = 0
         if params is not None and params.max_tokens is not None:
diff --git a/vllm_spyre/v1/core/scheduler.py b/vllm_spyre/v1/core/scheduler.py
index 164ee227e..5077dc755 100644
--- a/vllm_spyre/v1/core/scheduler.py
+++ b/vllm_spyre/v1/core/scheduler.py
@@ -788,21 +788,41 @@ def check_batch_tkv_limit_cp(
 
     def finish_requests(
         self,
-        request_ids: Union[str, Iterable[str]],
-        finished_status,
-    ) -> None:
-        """Handles removing finished requests from ongoing_prefills"""
+        request_ids: Union[str, Iterable[str], None],
+        finished_status: RequestStatus,
+    ) -> list[tuple[str, int]]:
+        """Finish requests in the base scheduler and drop them from ongoing_prefills.
+
+        Args:
+            request_ids: A single request id, an iterable of ids, or None to
+                finish all requests.
+            finished_status: Status forwarded to the base scheduler.
+
+        Returns:
+            Whatever the base scheduler's finish_requests returns.
+        """
         if isinstance(request_ids, str):
             request_ids = (request_ids,)
+        elif request_ids is not None:
+            # Materialize once: a one-shot iterable (e.g. a generator) would be
+            # exhausted by the base scheduler call below, so the membership
+            # test used to prune ongoing_prefills would never match anything.
+            request_ids = set(request_ids)
 
-        # first defer to vLLM scheduler where validation is handled
-        super(SpyreScheduler, self).finish_requests(
+        # first defer to vLLM scheduler
+        # validates the input requests and generates the output
+        aborted_requests = super(SpyreScheduler, self).finish_requests(
             request_ids=request_ids, finished_status=finished_status
         )
 
-        self.ongoing_prefills = [
-            r for r in self.ongoing_prefills if r.request_id not in request_ids
-        ]
+        # request_ids None means all requests are finished
+        self.ongoing_prefills = (
+            []
+            if request_ids is None
+            else [r for r in self.ongoing_prefills if r.request_id not in request_ids]
+        )
+
+        return aborted_requests
 
     def make_stats(self, *args, **kwargs) -> SchedulerStats | None:
         """Update the scheduler stats from the base scheduler.
diff --git a/vllm_spyre/v1/metrics/stats_logger.py b/vllm_spyre/v1/metrics/stats_logger.py
index 777b86dd0..8898d4a97 100644
--- a/vllm_spyre/v1/metrics/stats_logger.py
+++ b/vllm_spyre/v1/metrics/stats_logger.py
@@ -9,17 +9,15 @@
 from vllm.logger import init_logger
 from vllm.v1.engine import async_llm, llm_engine
 from vllm.v1.metrics.loggers import StatLoggerBase, StatLoggerManager
-from vllm.v1.metrics.stats import FinishedRequestStats, IterationStats, SchedulerStats
+from vllm.v1.metrics.stats import (
+    FinishedRequestStats,
+    IterationStats,
+    MultiModalCacheStats,
+    SchedulerStats,
+)
 
 from vllm_spyre import envs as envs_spyre
 
-try:
-    from vllm.v1.metrics.stats import MultiModalCacheStats
-except ImportError:
-    # compatibility for vllm pre 0.11.1
-    class MultiModalCacheStats:
-        pass
-
 
 logger = init_logger(__name__)
 
diff --git a/vllm_spyre/v1/worker/spyre_input_batch.py b/vllm_spyre/v1/worker/spyre_input_batch.py
index b36764d48..528b10a7c 100644
--- a/vllm_spyre/v1/worker/spyre_input_batch.py
+++ b/vllm_spyre/v1/worker/spyre_input_batch.py
@@ -5,7 +5,7 @@
 
 from abc import abstractmethod
 from dataclasses import dataclass, field
-from typing import Any, Generic, TypeVar, cast
+from typing import Generic, TypeVar, cast
 
 import numpy as np
 import torch
@@ -20,7 +20,6 @@
 from vllm.v1.sample.metadata import SamplingMetadata
 
 from vllm_spyre.v1.sample.spyre_logits_processor import LogitProcessorWrapper
-from vllm_spyre.compat_utils import has_argument
 
 
 @dataclass
@@ -742,12 +741,9 @@ def make_pooling_metadata(self) -> PoolingMetadata:
         assert len(self.requests_ids) == len(self.pooling_params)
         pooling_params = [self.pooling_params[req_id] for req_id in self.requests_ids]
 
-        kwargs: dict[str, Any] = {}
-        if has_argument(PoolingMetadata, "pooling_states"):
-            kwargs["pooling_states"] = []
         return PoolingMetadata(
             prompt_lens=torch.from_numpy(self._get_num_prompt_tokens()).to(self.device),
             prompt_token_ids=prompt_token_ids,
             pooling_params=pooling_params,
-            **kwargs,
+            pooling_states=[],
         )
diff --git a/vllm_spyre/v1/worker/spyre_model_runner.py b/vllm_spyre/v1/worker/spyre_model_runner.py
index 728301295..8f41a447e 100644
--- a/vllm_spyre/v1/worker/spyre_model_runner.py
+++ b/vllm_spyre/v1/worker/spyre_model_runner.py
@@ -15,12 +15,8 @@
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingType
 
-try:
-    # pre 0.11.1 compatibility
-    from vllm.utils import get_hash_fn_by_name, is_pin_memory_available  # ty: ignore[unresolved-import]
-except ImportError:
-    from vllm.utils.platform_utils import is_pin_memory_available
-    from vllm.utils.hashing import get_hash_fn_by_name
+from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.utils.hashing import get_hash_fn_by_name
 
 from vllm.v1.core.block_pool import BlockPool
 from vllm.v1.core.kv_cache_utils import KVCacheBlock, get_request_block_hasher, init_none_hash
@@ -35,7 +31,6 @@
 
 import vllm_spyre.envs as envs_spyre
 import vllm_spyre.utils as utils_spyre
-from vllm_spyre.compat_utils import has_argument
 from vllm_spyre.model_executor.model_loader.spyre import (
     BACKEND_LIST,
     SpyreAttentionMetadata,
@@ -955,14 +950,11 @@ def _set_blocks(self, num_blocks: int) -> None:
         self.kv_cache_manager = self._make_kv_cache_manager()
 
     def _make_block_pool(self) -> BlockPool:
-        kwargs = {}
-        if has_argument(BlockPool, "hash_block_size"):
-            kwargs["hash_block_size"] = self.block_size
         return BlockPool(
             num_gpu_blocks=self.n_blocks + 1,
             enable_caching=self.enable_prefix_caching,
             enable_kv_cache_events=False,
-            **kwargs,
+            hash_block_size=self.block_size,
         )
 
     def _make_kv_cache_manager(self) -> FullAttentionManager:
@@ -974,33 +966,24 @@ def _make_kv_cache_manager(self) -> FullAttentionManager:
             dtype=torch.float16,
         )
 
-        # Enable_caching parameter added in vllm v0.14.0
-        kwargs = {
-            "kv_cache_spec": self._attn_spec,
-            "block_pool": self.block_pool,
+        return FullAttentionManager(
+            kv_cache_spec=self._attn_spec,
+            block_pool=self.block_pool,
             # Currently don't support models with more than one
             # attention type, e.g. full and sliding window, so
             # there is only one group.
-            "kv_cache_group_id": 0,
+            kv_cache_group_id=0,
             # We don't support DCP
             # https://docs.vllm.ai/en/latest/serving/context_parallel_deployment/#decode-context-parallel
-            "dcp_world_size": 1,
-        }
-
-        # Conditionally add param for vLLM >= 0.14.0
-        if has_argument(FullAttentionManager.__init__, "enable_caching"):
-            kwargs["enable_caching"] = self.enable_prefix_caching
-
-        return FullAttentionManager(**kwargs)  # ty: ignore[invalid-argument-type]
+            dcp_world_size=1,
+            enable_caching=self.enable_prefix_caching,
+        )  # ty: ignore[invalid-argument-type]
 
     def _allocate_new_blocks_wrapper(self, req_id: str, num_tokens: int):
-        """Backwards compatibility for change to interface in v0.15.0"""
-        kwargs: dict[str, Any] = {
-            "num_tokens": num_tokens,
-        }
-        if has_argument(self.kv_cache_manager.allocate_new_blocks, "num_tokens_main_model"):
-            kwargs["num_tokens_main_model"] = num_tokens
-        return self.kv_cache_manager.allocate_new_blocks(req_id, **kwargs)
+        """Allocate blocks for req_id, using num_tokens for the main-model count as well"""
+        return self.kv_cache_manager.allocate_new_blocks(
+            req_id, num_tokens=num_tokens, num_tokens_main_model=num_tokens
+        )
 
     def _get_blocks(self, request_id: str) -> list[KVCacheBlock]:
         return self.kv_cache_manager.req_to_blocks[request_id]
@@ -1941,17 +1924,11 @@ def execute_model(
         pooling_metadata = self.input_batch.make_pooling_metadata()
 
         ## No partial prefill, hence we can use the prompt lens here
-        cursor_kwargs: dict[str, Any] = {}
-        if has_argument(pooling_metadata.build_pooling_cursor, "seq_lens_cpu"):
-            cursor_kwargs["seq_lens_cpu"] = pooling_metadata.prompt_lens
-
-        # v0.14.0 uses param "num_scheduled_tokens_np"
-        if has_argument(pooling_metadata.build_pooling_cursor, "num_scheduled_tokens_np"):
-            cursor_kwargs["num_scheduled_tokens_np"] = pooling_metadata.prompt_lens.numpy()
-        else:
-            cursor_kwargs["num_scheduled_tokens"] = pooling_metadata.prompt_lens.tolist()
-
-        pooling_metadata.build_pooling_cursor(device=self.device, **cursor_kwargs)
+        pooling_metadata.build_pooling_cursor(
+            device=self.device,
+            seq_lens_cpu=pooling_metadata.prompt_lens,
+            num_scheduled_tokens_np=pooling_metadata.prompt_lens.numpy(),
+        )
 
         # prepare unpadded output for the pooler
         hidden_state_list: list[torch.Tensor] = []
@@ -2002,11 +1979,7 @@ def __init__(
         # For hybrid KV caches, the `alignment_tokens` arg needs to be set to
         # the lowest common multiple of kv cache block sizes. Currently we only
         # support homogeneous kv caches with a single block size though.
-        self._alignment_token_kwargs = (
-            {"alignment_tokens": self.block_size}
-            if has_argument(FullAttentionManager.find_longest_cache_hit, "alignment_tokens")
-            else {}
-        )
+        self._alignment_token_kwargs = {"alignment_tokens": self.block_size}
 
         if vllm_config.cache_config.enable_prefix_caching:
             caching_hash_fn = get_hash_fn_by_name(vllm_config.cache_config.prefix_caching_hash_algo)
@@ -2384,20 +2357,12 @@ def _plan_chunking(self, scheduler_request: Request) -> ChunkedPrefillPlan:
             # blocks in the last chunk to deduplicate the used blocks. So
             # although we will recompute, we'll still point the block table
             # to the cached blocks.
-            try:
-                # vllm >= v0.14.0
-                self.kv_cache_manager.allocate_new_computed_blocks(
-                    request_id=scheduler_request.request_id,
-                    new_computed_blocks=computed_blocks,
-                    num_local_computed_tokens=len(computed_blocks) * self.block_size,
-                    num_external_computed_tokens=0,
-                )
-            except (AttributeError, TypeError):
-                # vllm < v0.14.0
-                self.kv_cache_manager.save_new_computed_blocks(
-                    scheduler_request.request_id,
-                    computed_blocks,
-                )
+            self.kv_cache_manager.allocate_new_computed_blocks(
+                request_id=scheduler_request.request_id,
+                new_computed_blocks=computed_blocks,
+                num_local_computed_tokens=len(computed_blocks) * self.block_size,
+                num_external_computed_tokens=0,
+            )
         else:
             usable_blocks = 0
             n_hit = 0
@@ -2432,7 +2397,6 @@ def add_new_request(self, request: NewRequestData):
             prompt_token_ids=prompt_token_ids,
             sampling_params=request.sampling_params,
             pooling_params=None,
-            eos_token_id=None,
             block_hasher=self.request_block_hasher,
             mm_features=mm_features,
         )
diff --git a/vllm_spyre/v1/worker/spyre_worker.py b/vllm_spyre/v1/worker/spyre_worker.py
index fb9ecf567..a80eba75f 100644
--- a/vllm_spyre/v1/worker/spyre_worker.py
+++ b/vllm_spyre/v1/worker/spyre_worker.py
@@ -21,12 +21,7 @@
 from vllm.distributed import ensure_model_parallel_initialized, init_distributed_environment
 from vllm.logger import init_logger
 
-try:
-    # vllm >= v0.14.0
-    from vllm.utils.torch_utils import set_random_seed
-except ImportError:
-    # vllm < v0.14.0
-    from vllm.model_executor import set_random_seed  # ty: ignore[unresolved-import]
+from vllm.utils.torch_utils import set_random_seed
 
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
@@ -154,12 +149,15 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         """
         return self.model_runner.get_kv_cache_spec()
 
-    def compile_or_warm_up_model(self) -> None:
-        """Prepare model for execution through compilation/warmup."""
+    def compile_or_warm_up_model(self) -> float:
+        """Prepare model for execution through compilation/warmup.
+
+        Returns:
+            The accumulated compilation time in seconds.
+        """
 
         if envs_spyre.VLLM_SPYRE_USE_CB:
-            self._warmup_spyre_dynamic_size(self.restricted_tokens)
-            return
+            return self._warmup_spyre_dynamic_size(self.restricted_tokens)
         if self.model_runner.is_multimodal:
             raise NotImplementedError(
                 "[WARMUP] Static batching is not supported for multimodal models."
@@ -207,6 +205,7 @@ def compile_or_warm_up_model(self) -> None:
             num_shape_combinations,
             all_warmup_total_t,
         )
+        return all_warmup_total_t
 
     def check_health(self) -> None:
         """Basic health check (override for device-specific checks)."""
@@ -262,24 +261,13 @@ def __init__(
         distributed_init_method: str,
         is_driver_worker: bool = False,
     ) -> None:
-        try:
-            # pre 0.11.1 compatibility with old worker base class
-            from vllm.worker.worker_base import WorkerBase as LegacyWorkerBase  # ty: ignore
-
-            LegacyWorkerBase.__init__(self, vllm_config=vllm_config)
-            self.local_rank = local_rank
-            self.rank = rank
-            self.distributed_init_method = distributed_init_method
-            self.is_driver_worker = is_driver_worker
-        except ImportError:
-            # From 0.11.1 and on we should only have to call the super init
-            super().__init__(
-                vllm_config=vllm_config,
-                local_rank=local_rank,
-                rank=rank,
-                distributed_init_method=distributed_init_method,
-                is_driver_worker=is_driver_worker,
-            )
+        super().__init__(
+            vllm_config=vllm_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            is_driver_worker=is_driver_worker,
+        )
 
         # For power-user debugging of spyre logs for tensor parallel ops
         self.redirect_logs_to_files()
@@ -289,22 +277,6 @@ def __init__(
             assert rank % self.parallel_config.tensor_parallel_size == 0, (
                 "Driver worker should be rank 0 of tensor parallel group."
             )
-        if self.model_config.trust_remote_code:
-            # note: lazy import to avoid importing torch before initializing
-            try:
-                # pre 0.11.1 compatibility
-                from vllm.utils import init_cached_hf_modules  # ty: ignore[unresolved-import]
-
-                init_cached_hf_modules()
-            except ImportError:
-                # 0.11.1 to 0.13.0 compatibility
-                try:
-                    from vllm.utils.import_utils import init_cached_hf_modules  # ty: ignore[unresolved-import]
-
-                    init_cached_hf_modules()
-                except ImportError:
-                    # >=0.14.0, init_cached_hf_modules is no longer needed
-                    pass
 
         self.model_runner: Union[
             StaticBatchingSpyreModelRunner,
@@ -576,7 +548,7 @@ def load_model(self):
         self.perf_metrics.log("load model time", load_model_total_t, model=self.model_config.model)
         logger.info("load model took %.3fs", load_model_total_t)
 
-    def _warmup_spyre_dynamic_size(self, special_token_ids):
+    def _warmup_spyre_dynamic_size(self, special_token_ids) -> float:
         warmup_start_t = time.time()
 
         # satisfy mypy
@@ -670,9 +642,6 @@ def _warmup_spyre_dynamic_size(self, special_token_ids):
             scheduled_cached_reqs=CachedRequestData.make_empty(),
             num_scheduled_tokens={deploy_req.req_id: prompt_len},
             total_num_scheduled_tokens=prompt_len,
-            scheduled_spec_decode_tokens={},
-            scheduled_encoder_inputs={},
-            num_common_prefix_blocks=[],
             finished_req_ids=set(),
             **_get_extra_args(),
         )
@@ -692,6 +661,7 @@ def _warmup_spyre_dynamic_size(self, special_token_ids):
         )
 
         maybe_override_signals_handler()
+        return warmup_total_t
 
     def _cleanup_model_runner(self, request) -> None:
         # Needed to clean up the data of model runner
@@ -701,9 +671,6 @@ def _cleanup_model_runner(self, request) -> None:
             num_scheduled_tokens={},
             # NOTE: this means no work to do
             total_num_scheduled_tokens=0,
-            scheduled_spec_decode_tokens={},
-            scheduled_encoder_inputs={},
-            num_common_prefix_blocks=[],
             # The requests to be removed
             finished_req_ids=set([r.req_id for r in request]),
             **_get_extra_args(),
@@ -783,9 +750,6 @@ def _warmup_spyre_fixed_size(
             scheduled_cached_reqs=cached_request_data,
             num_scheduled_tokens={r.req_id: self._get_num_tokens(r) for r in dummy_requests},
             total_num_scheduled_tokens=sum(prompt_len for _ in range(batch_size)),
-            scheduled_spec_decode_tokens={},
-            scheduled_encoder_inputs={},
-            num_common_prefix_blocks=[],
             finished_req_ids=set(),
             **_get_extra_args(),
         )
@@ -855,9 +819,6 @@ def _dynamic_warmup(
                 scheduled_cached_reqs=CachedRequestData.make_empty(),
                 num_scheduled_tokens={req.req_id: prompt_len},
                 total_num_scheduled_tokens=prompt_len,
-                scheduled_spec_decode_tokens={},
-                scheduled_encoder_inputs={},
-                num_common_prefix_blocks=[],
                 finished_req_ids=set(),
                 **_get_extra_args(),
             )
@@ -886,9 +847,6 @@ def _dynamic_warmup(
             scheduled_cached_reqs=cached_request_data,
             num_scheduled_tokens={req.req_id: 1 for req in requests},
             total_num_scheduled_tokens=1,
-            scheduled_spec_decode_tokens={},
-            scheduled_encoder_inputs={},
-            num_common_prefix_blocks=[],
             finished_req_ids=set(),
             **_get_extra_args(),
         )
@@ -996,14 +954,10 @@ def signal_handler(signum, frame):
 
 
 def _get_extra_args() -> dict:
-    """Add any required backwards compatibility code for constructing
-    SchedulerOutputs here"""
-    extra_args: dict = {}
-    extra_args.update({"free_encoder_mm_hashes": []})
-
-    if "structured_output_request_ids" in dataclass_fields(SchedulerOutput):
-        extra_args["structured_output_request_ids"] = {}
-    if "grammar_bitmask" in dataclass_fields(SchedulerOutput):
-        extra_args["grammar_bitmask"] = None
-
-    return extra_args
+    """Return a fresh dict of extra keyword arguments for constructing a SchedulerOutput"""
+    return {
+        "free_encoder_mm_hashes": [],
+        "scheduled_spec_decode_tokens": {},
+        "scheduled_encoder_inputs": {},
+        "num_common_prefix_blocks": [],
+    }