diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 960bbe744..a30445950 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -84,7 +84,7 @@ jobs: # Lower bound support - vllm_version: name: "vLLM:lowest" - repo: "git+https://github.com/vllm-project/vllm --tag v0.11.0" + repo: "git+https://github.com/vllm-project/vllm --tag v0.17.0" test_suite: name: "backward compat" markers: "compat or (cpu and basic and not quantized and not sb)" @@ -94,86 +94,9 @@ jobs: os: "ubuntu-latest" python_version: "3.12" # Intermediate versions of vllm to check basic support for as well - - vllm_version: - name: "vLLM:0.11.1" - repo: "git+https://github.com/vllm-project/vllm --tag v0.11.1" - test_suite: - name: "backward compat" - markers: "cpu and basic and not quantized and not sb" - flags: "--timeout=300" - hf_model_2: "sentence-transformers/all-roberta-large-v1" - hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c" - os: "ubuntu-latest" - python_version: "3.12" - - vllm_version: - name: "vLLM:0.11.2" - repo: "git+https://github.com/vllm-project/vllm --tag v0.11.2" - test_suite: - name: "backward compat" - markers: "cpu and basic and not quantized and not sb" - flags: "--timeout=300" - hf_model_2: "sentence-transformers/all-roberta-large-v1" - hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c" - os: "ubuntu-latest" - python_version: "3.12" - - vllm_version: - name: "vLLM:0.12.0" - repo: "git+https://github.com/vllm-project/vllm --tag v0.12.0" - test_suite: - name: "backward compat" - markers: "cpu and basic and not quantized and not sb" - flags: "--timeout=300" - hf_model_2: "sentence-transformers/all-roberta-large-v1" - hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c" - os: "ubuntu-latest" - python_version: "3.12" - - vllm_version: - name: "vLLM:0.13.0" - repo: "git+https://github.com/vllm-project/vllm --tag v0.13.0" - test_suite: - name: "backward compat" - markers: "cpu and basic and not quantized and not sb" - flags: "--timeout=300" - hf_model_2: "sentence-transformers/all-roberta-large-v1" - hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c" - os: "ubuntu-latest" - python_version: "3.12" - - vllm_version: - name: "vLLM:0.14.0" - repo: "git+https://github.com/vllm-project/vllm --tag v0.14.0" - test_suite: - name: "backward compat" - markers: "cpu and basic and not quantized and not sb" - flags: "--timeout=300" - hf_model_2: "sentence-transformers/all-roberta-large-v1" - hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c" - os: "ubuntu-latest" - python_version: "3.12" - - vllm_version: - name: "vLLM:0.14.1" - repo: "git+https://github.com/vllm-project/vllm --tag v0.14.1" - test_suite: - name: "backward compat" - markers: "cpu and basic and not quantized and not sb" - flags: "--timeout=300" - hf_model_2: "sentence-transformers/all-roberta-large-v1" - hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c" - os: "ubuntu-latest" - python_version: "3.12" - - vllm_version: - name: "vLLM:0.15.0" - repo: "git+https://github.com/vllm-project/vllm --tag v0.15.0" - test_suite: - name: "backward compat" - markers: "cpu and basic and not quantized and not sb" - flags: "--timeout=300" - hf_model_2: "sentence-transformers/all-roberta-large-v1" - hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c" - os: "ubuntu-latest" - python_version: "3.12" - vllm_version: name: "vLLM:0.15.1" - repo: "git+https://github.com/vllm-project/vllm --tag v0.15.1" + repo: "git+https://github.com/vllm-project/vllm --tag v0.17.1" test_suite: name: "backward compat" markers: "cpu and basic and not quantized and not sb" diff --git a/pyproject.toml b/pyproject.toml index 9d2883319..736a67f6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ license = {text = "Apache 2"} dependencies = [ "fms-model-optimizer[fp8]>=0.8.0", "ibm-fms>=1.7.0,<2.0", - "vllm>=0.11.0,<0.16.1", + "vllm>=0.17.0,<0.18.1", ] requires-python = ">=3.11" dynamic = ["version"] @@ -70,7 +70,7 @@ environments = [ ] [tool.uv.sources] -vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.16.0" } +vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.18.0" } [tool.ty.rules] possibly-missing-attribute = "ignore" diff --git a/tests/conftest.py b/tests/conftest.py index 3e38915ab..ef6fb48b6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -214,6 +214,8 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): SpyrePlatform._used_with_cli = False yield if should_do_global_cleanup_after_test: + # Workaround torch.accelerator.empty_cache for torch 2.7.1 and vllm v0.18.0 compatibility + setattr(torch.accelerator, "empty_cache", lambda: None) # noqa cleanup_dist_env_and_memory() diff --git a/tests/e2e/test_chunked_prefill_tkv_steps.py b/tests/e2e/test_chunked_prefill_tkv_steps.py index 7caf53ffb..7a7896101 100644 --- a/tests/e2e/test_chunked_prefill_tkv_steps.py +++ b/tests/e2e/test_chunked_prefill_tkv_steps.py @@ -119,9 +119,6 @@ def make_scheduler_output( scheduled_cached_reqs=scheduled_cached_reqs, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_tokens, - scheduled_spec_decode_tokens={}, - scheduled_encoder_inputs={}, - num_common_prefix_blocks=[], finished_req_ids=finished_req_ids, kv_connector_metadata=None, **extra_args, @@ -134,7 +131,6 @@ def make_new_request_data(req_id, prompt_len): prompt_token_ids=[42] * prompt_len, sampling_params=SamplingParams(), pooling_params=None, - eos_token_id=None, ) return NewRequestData.from_request(req, block_ids=[]) diff --git a/tests/scheduling_utils.py b/tests/scheduling_utils.py index 6a0a2c815..59326b618 100644 --- a/tests/scheduling_utils.py +++ b/tests/scheduling_utils.py @@ -120,7 +120,6 @@ def create_request_for_scheduler_test( request_id=str(request_id), sampling_params=sampling_params, prompt_token_ids=prompt, - eos_token_id=None, arrival_time=0, lora_request=None, pooling_params=None, diff --git a/tests/spyre_util.py b/tests/spyre_util.py index 50df7d71a..fe11dea7b 100644 --- a/tests/spyre_util.py +++ b/tests/spyre_util.py @@ -22,13 +22,8 @@ from vllm_spyre.platform import SpyrePlatform from vllm_spyre import envs -try: - # old - from vllm.utils import FlexibleArgumentParser, get_open_port -except ImportError: - # new - from vllm.utils.argparse_utils import FlexibleArgumentParser - from vllm.utils.network_utils import get_open_port +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.network_utils import get_open_port from vllm.v1.request import Request @@ -448,7 +443,6 @@ def create_random_request( request_id=str(request_id), prompt_token_ids=prompt_token_ids, sampling_params=sampling_params, - eos_token_id=None, arrival_time=0, lora_request=None, pooling_params=None, diff --git a/tests/utils/test_cli_args.py b/tests/utils/test_cli_args.py index 34996445b..9c1420637 100644 --- a/tests/utils/test_cli_args.py +++ b/tests/utils/test_cli_args.py @@ -10,12 +10,7 @@ from vllm_spyre.config.model_registry import get_model_registry from spyre_util import environ_checkpoint, REFERENCE_MODELS -try: - # old - from vllm.utils import FlexibleArgumentParser -except ImportError: - # new - from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser global_default = 192 @@ -69,8 +64,6 @@ def sendnn_configured() -> bool: "32", "-tp", "4", - "--swap-space", # to prevent a validation error in the 16GB memory test env. - "1", ] if model_name == "ibm-granite/granite-3.3-8b-instruct": diff --git a/tests/utils/test_platform_validation.py b/tests/utils/test_platform_validation.py index 36cb078c7..1f1a11653 100644 --- a/tests/utils/test_platform_validation.py +++ b/tests/utils/test_platform_validation.py @@ -4,8 +4,11 @@ from SamplingParams during request validation. """ +from unittest.mock import MagicMock import pytest + from vllm import SamplingParams +from vllm.inputs.data import token_inputs from vllm.pooling_params import PoolingParams from vllm.sampling_params import StructuredOutputsParams from vllm_spyre.platform import SpyrePlatform @@ -13,6 +16,17 @@ pytestmark = pytest.mark.skip_global_cleanup +@pytest.fixture(autouse=True) +def mock_spyre_config(): + """Mock SpyrePlatform._config for all tests.""" + original_config = SpyrePlatform._config + mock_config = MagicMock() + mock_config.model_config.max_model_len = 512 + SpyrePlatform._config = mock_config + yield mock_config + SpyrePlatform._config = original_config + + class TestStructuredOutputValidation: """Test that platform validation strips structured outputs from requests.""" @@ -24,7 +38,8 @@ def test_strips_structured_outputs(self): assert params.structured_outputs is not None - SpyrePlatform.validate_request("Test prompt", params) + processed_inputs = token_inputs(prompt_token_ids=[1, 2, 3]) + SpyrePlatform.validate_request(processed_inputs, params) assert params.structured_outputs is None @@ -34,7 +49,8 @@ def test_logs_warning_when_stripping(self, caplog_vllm_spyre): max_tokens=20, structured_outputs=StructuredOutputsParams(json_object=True) ) - SpyrePlatform.validate_request("Test prompt", params) + processed_inputs = token_inputs(prompt_token_ids=[1, 2, 3]) + SpyrePlatform.validate_request(processed_inputs, params) assert len(caplog_vllm_spyre.records) > 0 warning_record = caplog_vllm_spyre.records[0] @@ -55,7 +71,8 @@ def test_strips_different_structured_output_types(self, structured_output): assert params.structured_outputs is not None - SpyrePlatform.validate_request("Test prompt", params) + processed_inputs = token_inputs(prompt_token_ids=[1, 2, 3]) + SpyrePlatform.validate_request(processed_inputs, params) assert params.structured_outputs is None @@ -77,7 +94,8 @@ def test_preserves_other_sampling_params(self): "top_k": params.top_k, } - SpyrePlatform.validate_request("Test prompt", params) + processed_inputs = token_inputs(prompt_token_ids=[1, 2, 3]) + SpyrePlatform.validate_request(processed_inputs, params) # Verify other params are unchanged assert params.max_tokens == original_values["max_tokens"] @@ -92,7 +110,8 @@ def test_does_not_affect_pooling_params(self): pooling_params = PoolingParams() # Should not raise any errors and should return early - SpyrePlatform.validate_request("Test prompt", pooling_params) + processed_inputs = token_inputs(prompt_token_ids=[1, 2, 3]) + SpyrePlatform.validate_request(processed_inputs, pooling_params) # PoolingParams don't have structured_outputs, so just verify no exception assert True # If we got here, the early return worked diff --git a/tests/utils/test_upstream_compatibility.py b/tests/utils/test_upstream_compatibility.py index c73f64129..7cb64b261 100644 --- a/tests/utils/test_upstream_compatibility.py +++ b/tests/utils/test_upstream_compatibility.py @@ -1,164 +1,21 @@ +""" +This file previously contained backwards compatibility tests for vLLM versions < 0.17.0. +All backwards compatibility code has been removed as the minimum supported version is now v0.17.0. +""" + import os import pytest -from vllm.v1.core.block_pool import BlockPool -from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.core.single_type_kv_cache_manager import FullAttentionManager - -from vllm_spyre.compat_utils import dataclass_fields, has_argument pytestmark = pytest.mark.compat VLLM_VERSION = os.getenv("TEST_VLLM_VERSION", "default") -def test_pin_memory_available(): - if VLLM_VERSION == "vLLM:lowest": - try: - from vllm.utils import is_pin_memory_available # # noqa #ty: ignore - from vllm.utils import make_tensor_with_pad # # noqa #ty: ignore - from vllm.utils import init_cached_hf_modules # # noqa #ty: ignore - except ImportError as e: - raise AssertionError( - "remove backwards compatibility imports for " - "is_pin_memory_available, " - "make_tensor_with_pad and init_cached_hf_modules" - ) from e - - -def test_multi_modal_cache_stats(): - if VLLM_VERSION == "vLLM:lowest": - # If this import succeeds then remove the backwards compatibility type - # def for MultiModalCacheStats - with pytest.raises(ImportError): - from vllm.v1.metrics.stats import MultiModalCacheStats # # noqa #ty: ignore - - -def test_v0_worker_base(): - if VLLM_VERSION == "vLLM:lowest": - try: - from vllm.worker.worker_base import WorkerBase # # noqa #ty: ignore - except ImportError as e: - raise AssertionError( - "remove the backwards compatibility code from the SpyreWorker initializer" - ) from e - - -def test_structured_output_request_ids(): - if VLLM_VERSION == "vLLM:lowest": - # Can remove "structured_output_request_ids" and "grammar_bitmask" - # from backwards compat - assert "structured_output_request_ids" in dataclass_fields(SchedulerOutput) - - -def test_hash_block_size(): - if VLLM_VERSION == "vLLM:lowest": - # Can supply `hash_block_size` everywhere, this was added in 0.12.0 - assert not has_argument(BlockPool, "hash_block_size") - - -def test_alignment_tokens(): - if VLLM_VERSION == "vLLM:lowest": - # Can supply `alignment_tokens` everywhere, this was added in 0.12.0 - assert not has_argument(FullAttentionManager.find_longest_cache_hit, "alignment_tokens") - - -def test_argparse_utils(): - if VLLM_VERSION == "vLLM:lowest": - try: - from vllm.utils import FlexibleArgumentParser # noqa - except ImportError as e: - raise AssertionError( - "Fix backward compatible imports of " - "FlexibleArgumentParser which is no longer required" - ) from e - - -def test_pooler_api(): - if VLLM_VERSION == "vLLM:lowest": - try: - from vllm.model_executor.layers.pooler import ClassifierPooler, Pooler # noqa - except ImportError as e: - raise AssertionError( - "Backwards compatibility code for old pooler API " - "ClassifierPooler no longer required, related to vLLM PR #31973" - ) from e - - -def test_set_random_seed(): - if VLLM_VERSION == "vLLM:lowest": - try: - from vllm.model_executor import set_random_seed # noqa - except ImportError as e: - raise AssertionError( - "Backwards compatibility code for set_random_seed import no longer required" - ) from e - - -def test_enable_caching(): - if VLLM_VERSION == "vLLM:lowest": - # Can supply enable_caching everywhere, added in v0.14.0 - assert not has_argument(FullAttentionManager.__init__, "enable_caching"), ( - "Backwards compatibility code for enable_caching parameter " - "in FullAttentionManager no longer required" - ) - - -def test_pooling_metadata_build_cursor(): - if VLLM_VERSION == "vLLM:lowest": - from vllm.v1.pool.metadata import PoolingMetadata - - assert has_argument(PoolingMetadata.build_pooling_cursor, "num_scheduled_tokens"), ( - "Backwards compatibility code for num_scheduled_tokens parameter " - "in PoolingMetadata.build_pooling_cursor no longer required " - ) - - -def test_allocate_new_computed_blocks(): - if VLLM_VERSION == "vLLM:lowest": - # allocate_new_computed_blocks was added in v0.14.0 - # When save_new_computed_blocks no longer exists, remove the - # try/except compatibility code in spyre_model_runner.py - assert hasattr(FullAttentionManager, "save_new_computed_blocks"), ( - "Backwards compatibility code for save_new_computed_blocks " - "in FullAttentionManager no longer required, can use " - "allocate_new_computed_blocks everywhere" - ) - - -def test_allocate_new_blocks_new_arg(): - if VLLM_VERSION == "vLLM:lowest": - # allocate_new_blocks added an argument in v0.15.0 - # When that is our lowest, we can remove compat code that checks for the - # num_tokens_main_model argument (see _allocate_new_blocks_wrapper in - # spyre_model_runner.py) - assert not hasattr(FullAttentionManager.allocate_new_blocks, "num_tokens_main_model"), ( - "Backwards compatibility code checking existence of " - "num_tokens_main_model argument to allocate_new_blocks " - "in FullAttentionManager no longer required" - ) - - -def test_profiler_config(): - if VLLM_VERSION == "vLLM:lowest": - # When ProfilerConfig exists in lowest version, remove env var fallback - with pytest.raises(ImportError): - from vllm.profiler.wrapper import TorchProfilerWrapper # noqa - - -def test_multimodal_field_elem_signature(): - if VLLM_VERSION == "vLLM:lowest": - from vllm.multimodal.inputs import MultiModalFieldElem - - # When modality parameter is removed in lowest version, remove compat code - assert has_argument(MultiModalFieldElem.__init__, "modality"), ( - "Backwards compatibility code for MultiModalFieldElem modality/key " - "parameters no longer required" - ) - - -def test_dict_prompt_tok_prompt(): - if VLLM_VERSION == "vLLM:lowest": - # When these types exist in lowest version, remove try/except imports - with pytest.raises(ImportError): - from vllm.renderers.inputs import DictPrompt, TokPrompt # noqa +def test_minimum_version_is_017(): + """ + Verify that the minimum vLLM version is 0.17.0. + All backwards compatibility code for versions < 0.17.0 has been removed. + """ + # This test serves as documentation that v0.17.0 is the minimum supported version + assert True, "Minimum vLLM version is now 0.17.0" diff --git a/tests/v1/core/test_scheduler_structured_outputs.py b/tests/v1/core/test_scheduler_structured_outputs.py index 4282de668..27ced6b3b 100644 --- a/tests/v1/core/test_scheduler_structured_outputs.py +++ b/tests/v1/core/test_scheduler_structured_outputs.py @@ -73,7 +73,6 @@ def test_scheduler_strips_structured_output_request(self, mocked_scheduler, capl request_id="test_req", sampling_params=sampling_params, prompt_token_ids=list(range(50)), - eos_token_id=None, arrival_time=0, lora_request=None, pooling_params=None, @@ -110,7 +109,6 @@ def test_scheduler_handles_request_without_structured_output(self, mocked_schedu request_id="test_req", sampling_params=sampling_params, prompt_token_ids=list(range(50)), - eos_token_id=None, arrival_time=0, lora_request=None, pooling_params=None, @@ -146,7 +144,6 @@ def test_scheduler_handles_multiple_requests_with_structured_outputs( request_id=f"test_req_{i}", sampling_params=sampling_params, prompt_token_ids=list(range(50)), - eos_token_id=None, arrival_time=i, lora_request=None, pooling_params=None, @@ -189,7 +186,6 @@ def test_scheduler_only_strips_when_can_schedule_prefill_true(self, mocked_sched request_id="test_req", sampling_params=sampling_params, prompt_token_ids=list(range(50)), - eos_token_id=None, arrival_time=0, lora_request=None, pooling_params=None, @@ -224,7 +220,6 @@ def test_scheduler_preserves_other_request_attributes( request_id="test_req", sampling_params=sampling_params, prompt_token_ids=list(range(50)), - eos_token_id=100, arrival_time=1.5, lora_request=None, pooling_params=None, @@ -233,7 +228,6 @@ def test_scheduler_preserves_other_request_attributes( # Store original values original_request_id = request.request_id original_prompt_tokens = list(request.prompt_token_ids) if request.prompt_token_ids else [] - original_eos_token = request.eos_token_id original_arrival_time = request.arrival_time original_sampling_params = request.sampling_params @@ -245,7 +239,6 @@ def test_scheduler_preserves_other_request_attributes( # Verify other attributes are unchanged assert request.request_id == original_request_id assert request.prompt_token_ids == original_prompt_tokens - assert request.eos_token_id == original_eos_token assert request.arrival_time == original_arrival_time assert request.sampling_params is original_sampling_params # But structured_output_request should be None diff --git a/tests/v1/worker/mock_model.py b/tests/v1/worker/mock_model.py index 8694415a1..ee3880e92 100644 --- a/tests/v1/worker/mock_model.py +++ b/tests/v1/worker/mock_model.py @@ -1,4 +1,3 @@ -from dataclasses import fields from typing import Any import pytest @@ -147,12 +146,7 @@ def _schedule_new_request(self, request: Request, tokens_to_schedule: int) -> Sc scheduled_cached_reqs=CachedRequestData.make_empty(), num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=tokens_to_schedule, - scheduled_spec_decode_tokens={}, - scheduled_encoder_inputs={}, - num_common_prefix_blocks=[], - finished_req_ids=set(), - free_encoder_mm_hashes=[], - **self._compat_sched_output_kwargs(), + **self._extra_sched_output_kwargs(), ) def _schedule_running_requests( @@ -161,13 +155,9 @@ def _schedule_running_requests( num_computed_tokens: list[int], tokens_to_schedule: list[int], ) -> SchedulerOutput: - cached_reqs = CachedRequestData( - req_ids=req_ids, - new_token_ids=[], - new_block_ids=[], - num_computed_tokens=num_computed_tokens, - **self._compat_request_data_kwargs(), - ) + cached_reqs = CachedRequestData.make_empty() + cached_reqs.req_ids = req_ids + cached_reqs.num_computed_tokens = num_computed_tokens num_scheduled_tokens = {} total_num_scheduled_tokens = 0 @@ -180,35 +170,18 @@ def _schedule_running_requests( scheduled_cached_reqs=cached_reqs, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, - scheduled_spec_decode_tokens={}, - scheduled_encoder_inputs={}, - num_common_prefix_blocks=[], - finished_req_ids=set(), - free_encoder_mm_hashes=[], - **self._compat_sched_output_kwargs(), + **self._extra_sched_output_kwargs(), ) - def _compat_sched_output_kwargs(self) -> dict[str, Any]: - field_names = [field.name for field in fields(SchedulerOutput)] - kwargs: dict[str, Any] = {} - if "structured_output_request_ids" in field_names: - kwargs["structured_output_request_ids"] = {} - if "grammar_bitmask" in field_names: - kwargs["grammar_bitmask"] = None - return kwargs - - def _compat_request_data_kwargs(self) -> dict[str, Any]: - field_names = [field.name for field in fields(CachedRequestData)] - kwargs: dict[str, Any] = {} - if "resumed_req_ids" in field_names: - kwargs["resumed_req_ids"] = set() - if "all_token_ids" in field_names: - kwargs["all_token_ids"] = {} - if "num_output_tokens" in field_names: - kwargs["num_output_tokens"] = {} - if "resumed_from_preemption" in field_names: - kwargs["resumed_from_preemption"] = [] - return kwargs + def _extra_sched_output_kwargs(self) -> dict[str, Any]: + """Common kwargs for SchedulerOutput construction""" + return { + "scheduled_spec_decode_tokens": {}, + "scheduled_encoder_inputs": {}, + "num_common_prefix_blocks": [], + "finished_req_ids": set(), + "free_encoder_mm_hashes": [], + } def assert_block_tables_and_slot_mappings( self, diff --git a/tests/v1/worker/test_prefix_caching_worker.py b/tests/v1/worker/test_prefix_caching_worker.py index 1db88ebb3..1234b1ea6 100644 --- a/tests/v1/worker/test_prefix_caching_worker.py +++ b/tests/v1/worker/test_prefix_caching_worker.py @@ -4,7 +4,6 @@ from v1.worker.mock_model import InstrumentedModelRunner from spyre_util import REFERENCE_MODELS -from vllm_spyre.compat_utils import has_argument @pytest.mark.cpu @@ -51,11 +50,9 @@ def test_block_sharing_for_2_chunks( kv_cache_manager = pc_model_runner.kv_cache_manager - # compat: vLLM 0.15.0 added an argument - if has_argument(kv_cache_manager.allocate_new_blocks, "num_tokens_main_model"): - kv_cache_manager.allocate_new_blocks(request1.request.request_id, 192, 192) - else: - kv_cache_manager.allocate_new_blocks(request1.request.request_id, 192) + kv_cache_manager.allocate_new_blocks( + request1.request.request_id, num_tokens=192, num_tokens_main_model=192 + ) kv_cache_manager.cache_blocks(request1.request, 192) kv_cache_manager.free(request1.request.request_id) diff --git a/tests/v1/worker/test_spyre_input_batch.py b/tests/v1/worker/test_spyre_input_batch.py index 9d62e27d3..eb8a142f4 100644 --- a/tests/v1/worker/test_spyre_input_batch.py +++ b/tests/v1/worker/test_spyre_input_batch.py @@ -6,11 +6,8 @@ import torch from vllm.sampling_params import SamplingParams -try: - from vllm.utils import is_pin_memory_available, make_tensor_with_pad -except ImportError: - from vllm.utils.platform_utils import is_pin_memory_available - from vllm.utils.torch_utils import make_tensor_with_pad +from vllm.utils.platform_utils import is_pin_memory_available +from vllm.utils.torch_utils import make_tensor_with_pad from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata diff --git a/uv.lock b/uv.lock index 204aa839c..e7b4ba1e4 100644 --- a/uv.lock +++ b/uv.lock @@ -2,8 +2,12 @@ version = 1 revision = 3 requires-python = ">=3.11" resolution-markers = [ - "python_full_version >= '3.12' and platform_machine != 'aarch64'", - "python_full_version >= '3.12' and platform_machine == 'aarch64'", + "python_full_version >= '3.14' and platform_machine != 'aarch64'", + "python_full_version == '3.13.*' and platform_machine != 'aarch64'", + "python_full_version == '3.12.*' and platform_machine != 'aarch64'", + "python_full_version >= '3.14' and platform_machine == 'aarch64'", + "python_full_version == '3.13.*' and platform_machine == 'aarch64'", + "python_full_version == '3.12.*' and platform_machine == 'aarch64'", "python_full_version < '3.12' and platform_machine != 'aarch64'", "python_full_version < '3.12' and platform_machine == 'aarch64'", ] @@ -21,7 +25,7 @@ overrides = [ { name = "torchaudio", marker = "sys_platform == 'never'" }, { name = "torchvision", marker = "sys_platform == 'never'" }, { name = "triton", marker = "sys_platform == 'never'" }, - { name = "vllm", marker = "platform_machine not in 's390x, ppc64le'", git = "https://github.com/vllm-project/vllm?rev=v0.16.0" }, + { name = "vllm", marker = "platform_machine not in 's390x, ppc64le'", git = "https://github.com/vllm-project/vllm?rev=v0.18.0" }, ] [[package]] @@ -1072,6 +1076,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fc/31/6a93a887617ee7deeaa602ca3d02d1c12a6cb8a742a695de5d128f5fa46a/gguf-0.17.1-py3-none-any.whl", hash = "sha256:7bc5aa7eeb1931f7d39b48fdc5b38fda6b294b9dca75cf607ac69557840a3943", size = 96224, upload-time = "2025-06-19T14:00:32.88Z" }, ] +[[package]] +name = "googleapis-common-protos" +version = "1.73.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/99/96/a0205167fa0154f4a542fd6925bdc63d039d88dab3588b875078107e6f06/googleapis_common_protos-1.73.0.tar.gz", hash = "sha256:778d07cd4fbeff84c6f7c72102f0daf98fa2bfd3fa8bea426edc545588da0b5a", size = 147323, upload-time = "2026-03-06T21:53:09.727Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/28/23eea8acd65972bbfe295ce3666b28ac510dfcb115fac089d3edb0feb00a/googleapis_common_protos-1.73.0-py3-none-any.whl", hash = "sha256:dfdaaa2e860f242046be561e6d6cb5c5f1541ae02cfbcb034371aadb2942b4e8", size = 297578, upload-time = "2026-03-06T21:52:33.933Z" }, +] + [[package]] name = "grpcio" version = "1.76.0" @@ -1123,19 +1139,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/41/0b430b01a2eb38ee887f88c1f07644a1df8e289353b78e82b37ef988fb64/grpcio-1.76.0-cp314-cp314-win_amd64.whl", hash = "sha256:922fa70ba549fce362d2e2871ab542082d66e2aaf0c19480ea453905b01f384e", size = 4834462, upload-time = "2025-10-21T16:22:39.772Z" }, ] -[[package]] -name = "grpcio-reflection" -version = "1.76.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "grpcio" }, - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/bd/10/767f9c2719c435616141efb3371f6e158f95cdde36a34876ae1d08ba7440/grpcio_reflection-1.76.0.tar.gz", hash = "sha256:e0e7e49921c2ee951e5ddff0bdbacbd1ac1a70888beb61d567f3d01b799decb1", size = 18845, upload-time = "2025-10-21T16:28:57.776Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/32/af/6168cf4ff389deed1388b1196281c67cb36dbbf44aaee40e2bfb72ac0202/grpcio_reflection-1.76.0-py3-none-any.whl", hash = "sha256:d7c43f2047a2a9c9320a5905aa7133c677977436b5f63e6a868e507864a11c73", size = 22702, upload-time = "2025-10-21T16:27:40.846Z" }, -] - [[package]] name = "h11" version = "0.16.0" @@ -1366,6 +1369,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/f2/53b6e9bdd2a91202066764eaa74b572ba4dede0fe47a5a26f4de34b7541a/ijson-3.4.0.post0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a0fedf09c0f6ffa2a99e7e7fd9c5f3caf74e655c1ee015a0797383e99382ebc3", size = 54657, upload-time = "2025-10-10T05:29:24.482Z" }, ] +[[package]] +name = "importlib-metadata" +version = "8.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, +] + [[package]] name = "iniconfig" version = "2.3.0" @@ -1753,7 +1768,7 @@ wheels = [ [[package]] name = "mistral-common" -version = "1.9.1" +version = "1.10.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jsonschema" }, @@ -1765,9 +1780,9 @@ dependencies = [ { name = "tiktoken" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/db/ce/685b8127a326478e05501cb4c9ca23d1cd9f37e16c465a1e832c75aea709/mistral_common-1.9.1.tar.gz", hash = "sha256:550583d70a395c3586cfb748ffab53bd1d7c3409507f0efc0118bff30ffb26e9", size = 6338922, upload-time = "2026-02-12T10:53:41.639Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/22/f798c1acc3f8cf32b6201b063d96867d79aa39d31dff12478739e1a78979/mistral_common-1.10.0.tar.gz", hash = "sha256:e456ff101edbdfc094039ec6c26f7d0f73356729798d628a6e6e96c3917147bc", size = 6351515, upload-time = "2026-03-13T10:13:46.683Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ac/72/a38bb1fd9fd4d4ef990341c9dd1a7c8061f1951e10efa6d50c0a3f04eced/mistral_common-1.9.1-py3-none-any.whl", hash = "sha256:9e2b2520b6f67bac2e2bb06fcf985b7a1277b01938da2b7cda8cf0fdbfa92e91", size = 6518623, upload-time = "2026-02-12T10:53:39.457Z" }, + { url = "https://files.pythonhosted.org/packages/87/c6/1429a0a3ab40f8530492b62b52eb792266c261b22ed62aa7f25d61d531ae/mistral_common-1.10.0-py3-none-any.whl", hash = "sha256:c594d1a05202b61e8f0d867ec6064df4c5e5d492c2c2bdb6fd8fb4872c6afd8b", size = 6525284, upload-time = "2026-03-13T10:13:44.329Z" }, ] [package.optional-dependencies] @@ -1775,56 +1790,6 @@ image = [ { name = "opencv-python-headless" }, ] -[[package]] -name = "mlx" -version = "0.30.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mlx-metal", marker = "platform_machine != 'aarch64' and sys_platform == 'darwin'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/b6/dfcfffc41d832a86249715fab336dc8638c2237035287eb24af792484c53/mlx-0.30.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:794e79587a4906bdb3c5473ef936f45008eaaa609a3c498cc29a442b2c829621", size = 568664, upload-time = "2026-01-14T01:16:45.573Z" }, - { url = "https://files.pythonhosted.org/packages/22/9f/22d494b83b611380063da31c2b482db8c620f7ad6531cfcd1e11f7c35852/mlx-0.30.3-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:472cdc6eaca8610224621a1561e8c36477eab1a2f0dd3eb49b95484d739c4605", size = 568663, upload-time = "2026-01-14T01:16:46.588Z" }, - { url = "https://files.pythonhosted.org/packages/0d/76/b6fb0500aef8e9ed65d4730d8c34b13d7a770ca863b9af363b5713a16040/mlx-0.30.3-cp311-cp311-macosx_26_0_arm64.whl", hash = "sha256:a5d82be69c7e671dc4d5855d2f6aedcb507817e5985478903ab754b642d9ba01", size = 568522, upload-time = "2026-01-14T05:52:08.334Z" }, - { url = "https://files.pythonhosted.org/packages/11/b3/e24c3a69dad0cf4404bb174c6fed0d804022da64758cd815a254e1cd0627/mlx-0.30.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:0b275168b80645a155b456e1a457a37fb5ee2c251e8fbd8db9e153351a9e2d2f", size = 569398, upload-time = "2026-01-14T01:16:49.804Z" }, - { url = "https://files.pythonhosted.org/packages/0b/87/d0804443da97a06d3439f6efb0ceffa178f530a121f0f4a6c77b39f8bfd7/mlx-0.30.3-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:6e818de14864982e832344198240a1dafba7d3316c4eb6f1b8e43b4dd25dd2ef", size = 569396, upload-time = "2026-01-14T01:16:51.007Z" }, - { url = "https://files.pythonhosted.org/packages/cf/dc/7cdd95e4561b73fba8c86bf11293797076120400e472fe2a72ef483b6d8d/mlx-0.30.3-cp312-cp312-macosx_26_0_arm64.whl", hash = "sha256:d23b422209fd4b7ecacef59070321f8c6a122f906a5e9b6683a5fc9e1b8fcd5c", size = 569192, upload-time = "2026-01-14T05:52:09.715Z" }, - { url = "https://files.pythonhosted.org/packages/d0/22/42935d593fe82d3b98eb9d60e4620ed99703886635106f89d407c68f33bc/mlx-0.30.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:743fac1e4f9e8e46c8262943c643a31139c255cdb256c99ad496958215ccac1e", size = 569344, upload-time = "2026-01-14T01:16:54.847Z" }, - { url = "https://files.pythonhosted.org/packages/7d/27/f2e7a5236289d45315d0215e8553b4dd7e2faaba3bcb5025b34b25d5ab66/mlx-0.30.3-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:3b04ae81655aa0e63a6e8f2c749de3bbce64cf5b168ae10f39ed086dfa99e7f8", size = 569345, upload-time = "2026-01-14T01:16:56.564Z" }, - { url = "https://files.pythonhosted.org/packages/01/41/06b042457f51952456e9bb46b2c6e205ab3a28fc52d6751b5787fdb762b2/mlx-0.30.3-cp313-cp313-macosx_26_0_arm64.whl", hash = "sha256:ba9b5bdb1e929cc130af72efd7f73508c0f4e526d224489af7ec1c6419564659", size = 569213, upload-time = "2026-01-14T05:52:10.86Z" }, - { url = "https://files.pythonhosted.org/packages/82/e2/6e551bd48fb350fbf0ee4cc5cd09485437d260b8f4937f22d8623e14687a/mlx-0.30.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:2c27fd8daaae14ca6cf407fcd236006a6e968f7708c8f61a2709116f2e754852", size = 571920, upload-time = "2026-01-14T01:16:59.683Z" }, - { url = "https://files.pythonhosted.org/packages/82/c0/561d1c9d3d12830b0e7fdcbd807585ef20909e398d4bcdbf25e4367543eb/mlx-0.30.3-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:b755fd4ed4b6a2ae4dee3766b5a2ea52fcbe83ebd1cf018458e18b74139409f3", size = 571921, upload-time = "2026-01-14T01:17:00.868Z" }, - { url = "https://files.pythonhosted.org/packages/42/1a/fb573fc2edc22a777fa254ff5c0c886ffd2c88aeb1f21c45778ef170f990/mlx-0.30.3-cp314-cp314-macosx_26_0_arm64.whl", hash = "sha256:7e352c0369a2f7e54d4f317b434eab3333918ea9edde1c43c61d36386b6f76bf", size = 571732, upload-time = "2026-01-14T05:52:11.893Z" }, -] - -[[package]] -name = "mlx-lm" -version = "0.29.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "jinja2", marker = "platform_machine != 'aarch64'" }, - { name = "mlx", marker = "platform_machine != 'aarch64' and sys_platform == 'darwin'" }, - { name = "numpy", marker = "platform_machine != 'aarch64'" }, - { name = "protobuf", marker = "platform_machine != 'aarch64'" }, - { name = "pyyaml", marker = "platform_machine != 'aarch64'" }, - { name = "sentencepiece", marker = "platform_machine != 'aarch64'" }, - { name = "transformers", marker = "platform_machine != 'aarch64'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e3/62/f46e1355256a114808517947f8e83ad6be310c7288c551db0fa678f47923/mlx_lm-0.29.1.tar.gz", hash = "sha256:b99180d8f33d33a077b814e550bfb2d8a59ae003d668fd1f4b3fff62a381d34b", size = 232302, upload-time = "2025-12-16T16:58:27.959Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/53/913099c91d384e115ea078325efd9a0bc1ea3eb3458c694b4596cbd267f2/mlx_lm-0.29.1-py3-none-any.whl", hash = "sha256:440941b3054c2a2216e97615de584cc90fa1ea874782e20699b9895721fad8dc", size = 324884, upload-time = "2025-12-16T16:58:26.36Z" }, -] - -[[package]] -name = "mlx-metal" -version = "0.30.3" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f6/63/4d8f6fefb507c028df4454dabfe8d8e0ad2961bb06510b6aca23d2d5b2be/mlx_metal-0.30.3-py3-none-macosx_14_0_arm64.whl", hash = "sha256:6276312b02353714c7c6515169569fe1c4bebe3229c8ecf1fdb375a13e78c966", size = 37716245, upload-time = "2026-01-14T01:16:34.838Z" }, - { url = "https://files.pythonhosted.org/packages/35/91/1d452e48a4bb4958844fd3bb28ae31b8de110549c009ebec5024ce27ebf3/mlx_metal-0.30.3-py3-none-macosx_15_0_arm64.whl", hash = "sha256:c096c0a3428f3f96a06220f97a36f9528b18bc05173f821eb05bc8458e723fa8", size = 37712125, upload-time = "2026-01-14T01:16:38.619Z" }, - { url = "https://files.pythonhosted.org/packages/fe/36/7a3cbca85542b5ca4faf871e35927f43aa0e3fc830ae5b699780fe723677/mlx_metal-0.30.3-py3-none-macosx_26_0_arm64.whl", hash = "sha256:69068533bd1ee8b0379ce5de57ed5fd313577a10ecab58e1332fd1ff7248a75e", size = 46488962, upload-time = "2026-01-14T05:52:04.523Z" }, -] - [[package]] name = "model-hosting-container-standards" version = "0.1.13" @@ -2215,6 +2180,132 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f2/35/0858e9e71b36948eafbc5e835874b63e515179dc3b742cbe3d76bc683439/opencv_python_headless-4.12.0.88-cp37-abi3-win_amd64.whl", hash = "sha256:86b413bdd6c6bf497832e346cd5371995de148e579b9774f8eba686dee3f5528", size = 38923559, upload-time = "2025-07-07T09:15:25.229Z" }, ] +[[package]] +name = "opentelemetry-api" +version = "1.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "importlib-metadata" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2c/1d/4049a9e8698361cc1a1aa03a6c59e4fa4c71e0c0f94a30f988a6876a2ae6/opentelemetry_api-1.40.0.tar.gz", hash = "sha256:159be641c0b04d11e9ecd576906462773eb97ae1b657730f0ecf64d32071569f", size = 70851, upload-time = "2026-03-04T14:17:21.555Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/bf/93795954016c522008da367da292adceed71cca6ee1717e1d64c83089099/opentelemetry_api-1.40.0-py3-none-any.whl", hash = "sha256:82dd69331ae74b06f6a874704be0cfaa49a1650e1537d4a813b86ecef7d0ecf9", size = 68676, upload-time = "2026-03-04T14:17:01.24Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp" +version = "1.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-exporter-otlp-proto-grpc" }, + { name = "opentelemetry-exporter-otlp-proto-http" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d0/37/b6708e0eff5c5fb9aba2e0ea09f7f3bcbfd12a592d2a780241b5f6014df7/opentelemetry_exporter_otlp-1.40.0.tar.gz", hash = "sha256:7caa0870b95e2fcb59d64e16e2b639ecffb07771b6cd0000b5d12e5e4fef765a", size = 6152, upload-time = "2026-03-04T14:17:23.235Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/fc/aea77c28d9f3ffef2fdafdc3f4a235aee4091d262ddabd25882f47ce5c5f/opentelemetry_exporter_otlp-1.40.0-py3-none-any.whl", hash = "sha256:48c87e539ec9afb30dc443775a1334cc5487de2f72a770a4c00b1610bf6c697d", size = 7023, upload-time = "2026-03-04T14:17:03.612Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-common" +version = "1.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-proto" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/51/bc/1559d46557fe6eca0b46c88d4c2676285f1f3be2e8d06bb5d15fbffc814a/opentelemetry_exporter_otlp_proto_common-1.40.0.tar.gz", hash = "sha256:1cbee86a4064790b362a86601ee7934f368b81cd4cc2f2e163902a6e7818a0fa", size = 20416, upload-time = "2026-03-04T14:17:23.801Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8b/ca/8f122055c97a932311a3f640273f084e738008933503d0c2563cd5d591fc/opentelemetry_exporter_otlp_proto_common-1.40.0-py3-none-any.whl", hash = "sha256:7081ff453835a82417bf38dccf122c827c3cbc94f2079b03bba02a3165f25149", size = 18369, upload-time = "2026-03-04T14:17:04.796Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-grpc" +version = "1.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-common" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8f/7f/b9e60435cfcc7590fa87436edad6822240dddbc184643a2a005301cc31f4/opentelemetry_exporter_otlp_proto_grpc-1.40.0.tar.gz", hash = "sha256:bd4015183e40b635b3dab8da528b27161ba83bf4ef545776b196f0fb4ec47740", size = 25759, upload-time = "2026-03-04T14:17:24.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/96/6f/7ee0980afcbdcd2d40362da16f7f9796bd083bf7f0b8e038abfbc0300f5d/opentelemetry_exporter_otlp_proto_grpc-1.40.0-py3-none-any.whl", hash = "sha256:2aa0ca53483fe0cf6405087a7491472b70335bc5c7944378a0a8e72e86995c52", size = 20304, upload-time = "2026-03-04T14:17:05.942Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-http" +version = "1.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-common" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "requests" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2e/fa/73d50e2c15c56be4d000c98e24221d494674b0cc95524e2a8cb3856d95a4/opentelemetry_exporter_otlp_proto_http-1.40.0.tar.gz", hash = "sha256:db48f5e0f33217588bbc00274a31517ba830da576e59503507c839b38fa0869c", size = 17772, upload-time = "2026-03-04T14:17:25.324Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/3a/8865d6754e61c9fb170cdd530a124a53769ee5f740236064816eb0ca7301/opentelemetry_exporter_otlp_proto_http-1.40.0-py3-none-any.whl", hash = "sha256:a8d1dab28f504c5d96577d6509f80a8150e44e8f45f82cdbe0e34c99ab040069", size = 19960, upload-time = "2026-03-04T14:17:07.153Z" }, +] + +[[package]] +name = "opentelemetry-proto" +version = "1.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4c/77/dd38991db037fdfce45849491cb61de5ab000f49824a00230afb112a4392/opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd", size = 45667, upload-time = "2026-03-04T14:17:31.194Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/b2/189b2577dde745b15625b3214302605b1353436219d42b7912e77fa8dc24/opentelemetry_proto-1.40.0-py3-none-any.whl", hash = "sha256:266c4385d88923a23d63e353e9761af0f47a6ed0d486979777fe4de59dc9b25f", size = 72073, upload-time = "2026-03-04T14:17:16.673Z" }, +] + +[[package]] +name = "opentelemetry-sdk" +version = "1.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/58/fd/3c3125b20ba18ce2155ba9ea74acb0ae5d25f8cd39cfd37455601b7955cc/opentelemetry_sdk-1.40.0.tar.gz", hash = "sha256:18e9f5ec20d859d268c7cb3c5198c8d105d073714db3de50b593b8c1345a48f2", size = 184252, upload-time = "2026-03-04T14:17:31.87Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/c5/6a852903d8bfac758c6dc6e9a68b015d3c33f2f1be5e9591e0f4b69c7e0a/opentelemetry_sdk-1.40.0-py3-none-any.whl", hash = "sha256:787d2154a71f4b3d81f20524a8ce061b7db667d24e46753f32a7bc48f1c1f3f1", size = 141951, upload-time = "2026-03-04T14:17:17.961Z" }, +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.61b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/c0/4ae7973f3c2cfd2b6e321f1675626f0dab0a97027cc7a297474c9c8f3d04/opentelemetry_semantic_conventions-0.61b0.tar.gz", hash = "sha256:072f65473c5d7c6dc0355b27d6c9d1a679d63b6d4b4b16a9773062cb7e31192a", size = 145755, upload-time = "2026-03-04T14:17:32.664Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/37/cc6a55e448deaa9b27377d087da8615a3416d8ad523d5960b78dbeadd02a/opentelemetry_semantic_conventions-0.61b0-py3-none-any.whl", hash = "sha256:fa530a96be229795f8cef353739b618148b0fe2b4b3f005e60e262926c4d38e2", size = 231621, upload-time = "2026-03-04T14:17:19.33Z" }, +] + +[[package]] +name = "opentelemetry-semantic-conventions-ai" +version = "0.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-sdk" }, + { name = "opentelemetry-semantic-conventions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/0b/0ff2326417a9eed74ff6717629075246098dcbda067a62fd73095139babb/opentelemetry_semantic_conventions_ai-0.5.0.tar.gz", hash = "sha256:64c21c5ae0c971ee2ecab986d66e93bb50e616b52e18a1284e118a323a9e6869", size = 25202, upload-time = "2026-03-20T08:47:05.751Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/18/35fec29ed6e49bcbbe629b790cc0deb5bb58da9caceee29b39b54d3d7f47/opentelemetry_semantic_conventions_ai-0.5.0-py3-none-any.whl", hash = "sha256:8727f474f590138f5e4937945378878a5b2f4ea82bc24ffd93265ca9fbdc48a4", size = 9983, upload-time = "2026-03-20T08:47:06.843Z" }, +] + [[package]] name = "outlines-core" version = "0.2.11" @@ -4273,8 +4364,8 @@ wheels = [ [[package]] name = "vllm" -version = "0.16.0" -source = { git = "https://github.com/vllm-project/vllm?rev=v0.16.0#89a77b10846fd96273cce78d86d2556ea582d26e" } +version = "0.18.0+cpu" +source = { git = "https://github.com/vllm-project/vllm?rev=v0.18.0#bcf2be96120005e9aea171927f85055a6a5c0cf6" } dependencies = [ { name = "aiohttp" }, { name = "anthropic" }, @@ -4289,8 +4380,6 @@ dependencies = [ { name = "fastapi", extra = ["standard"] }, { name = "filelock" }, { name = "gguf" }, - { name = "grpcio" }, - { name = "grpcio-reflection" }, { name = "ijson" }, { name = "intel-openmp", marker = "platform_machine == 'x86_64'" }, { name = "lark" }, @@ -4306,6 +4395,10 @@ dependencies = [ { name = "openai" }, { name = "openai-harmony" }, { name = "opencv-python-headless" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp" }, + { name = "opentelemetry-sdk" }, + { name = "opentelemetry-semantic-conventions-ai" }, { name = "outlines-core" }, { name = "partial-json-parser" }, { name = "pillow" }, @@ -4362,7 +4455,7 @@ dev = [ requires-dist = [ { name = "fms-model-optimizer", extras = ["fp8"], specifier = ">=0.8.0" }, { name = "ibm-fms", specifier = ">=1.7.0,<2.0" }, - { name = "vllm", git = "https://github.com/vllm-project/vllm?rev=v0.16.0" }, + { name = "vllm", git = "https://github.com/vllm-project/vllm?rev=v0.18.0" }, ] [package.metadata.requires-dev] @@ -4534,10 +4627,9 @@ wheels = [ [[package]] name = "xgrammar" -version = "0.1.29" +version = "0.1.32" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "mlx-lm", marker = "platform_machine == 'arm64' and sys_platform == 'darwin'" }, { name = "numpy" }, { name = "pydantic" }, { name = "torch", marker = "sys_platform == 'never'" }, @@ -4545,21 +4637,32 @@ dependencies = [ { name = "triton", marker = "sys_platform == 'never'" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/02/a3/70dbe3ffd331a1e7e1ad5a95690a4086e6c7cdb8089f5c7eda712219ccec/xgrammar-0.1.29.tar.gz", hash = "sha256:cf195afa81b489eebf35d4c6f37f27136d05420739ab4a6f7f065c938d7e4baa", size = 2321317, upload-time = "2025-12-19T08:23:54.53Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/de/88832fac40962fd0d4703bd4ba84598b06b8408bdc4a6722744f363f68a6/xgrammar-0.1.29-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:d2a7eef1b75b8d31b868d5c79855622aad203275ff267fc0e0ef77dd91906cfe", size = 16008004, upload-time = "2025-12-19T08:23:11.998Z" }, - { url = "https://files.pythonhosted.org/packages/76/f6/4d22eec5305657430955442077306bc6ed85becc564116165d4b3a7049ad/xgrammar-0.1.29-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4af7f6ce2b2c6295b936b7cbda09f78e33f2c492a139cd64560f5d8d0fe967ed", size = 17914326, upload-time = "2025-12-19T08:23:14.43Z" }, - { url = "https://files.pythonhosted.org/packages/87/0b/b5e5c99ce13a9d378a940cda07c5a08b50cc7efb66936c6ac8fa8232a0d5/xgrammar-0.1.29-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51bcfd63bd48a0b26209ffd2143a42067518559355ec9e4e574cef2ae74fac7c", size = 34699408, upload-time = "2025-12-19T08:23:16.906Z" }, - { url = "https://files.pythonhosted.org/packages/a3/a0/4ebc1b3f5af79a3f73d0566034758f3fbcd9c64174646314a9a6f7cc1d27/xgrammar-0.1.29-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e27b50cf8c565845295a8263a4a0790c00a7c1fd783e76222fc0f575654d6f56", size = 34903461, upload-time = "2025-12-19T08:23:19.556Z" }, - { url = "https://files.pythonhosted.org/packages/77/21/f6b3978dc9761bbfbbb153d33441206ce2253efa271d8e2d8b6b210d2bd7/xgrammar-0.1.29-cp311-cp311-win_amd64.whl", hash = "sha256:c9f8ea76bcf41b48168974b509b1546d2bee289ff1b20c68bc97434c1ea6e49a", size = 5928633, upload-time = "2025-12-19T08:23:21.67Z" }, - { url = "https://files.pythonhosted.org/packages/c1/d8/fb282fc78be6e9bbefb5cb389f66b22e4efd6ae14f06234f599651620da5/xgrammar-0.1.29-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:d992a3cee7594bbdaa64ae59f90da5ce21c5fe654719df3816014289ada6f04d", size = 16007376, upload-time = "2025-12-19T08:23:23.634Z" }, - { url = "https://files.pythonhosted.org/packages/82/a7/2c9767620ee50f2f40f1eb95e55a3a29e1a0670f087ee6dc1bc1c887b906/xgrammar-0.1.29-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1bbdf02e45cfa8614218ba01ca7952d375f8bc1c13884e3d04daa4b54180cbc2", size = 17913535, upload-time = "2025-12-19T08:23:26.02Z" }, - { url = "https://files.pythonhosted.org/packages/57/94/18793c64bf0368075a34c06e196bf002f1e6ab0aee332268f44e8d356d5a/xgrammar-0.1.29-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6eb370a16b27a683e5f2b9e429ab41440c69977d4a504849ed61831b94cc704c", size = 34705239, upload-time = "2025-12-19T08:23:28.369Z" }, - { url = "https://files.pythonhosted.org/packages/3e/da/4c14e3e00be698009b52700f15326a23272b4b00475939b6acc86b151188/xgrammar-0.1.29-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:79e6e4f5cd33be77418cf91efc482f2b3d773d309891224383bc8a4948ad7b07", size = 34906135, upload-time = "2025-12-19T08:23:30.838Z" }, - { url = "https://files.pythonhosted.org/packages/22/d8/34423997f48627cef3b74cc894d9dfcaacae02941c06237ac5f3196406a7/xgrammar-0.1.29-cp312-cp312-win_amd64.whl", hash = "sha256:39bdfadedbce34599835486164fa80ba00248c6c75ad91f3843db90ef37e037f", size = 5928381, upload-time = "2025-12-19T08:23:33.428Z" }, - { url = "https://files.pythonhosted.org/packages/2c/ef/8a4b4cb10fc996c0a25c9bf5613aaf5a86114291a9a4003e43605cab42bf/xgrammar-0.1.29-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fedf21e447ef646f23a6e2d11877c0812d55965dcf8c0aa9b0f32590c9b6e22a", size = 17913609, upload-time = "2025-12-19T08:23:36.06Z" }, - { url = "https://files.pythonhosted.org/packages/e9/c5/e4965c9921e7bb6061f246ae7f8c7b9b1dfc21262248100c2f9b398b361e/xgrammar-0.1.29-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb22aea775971f7d8c4d0e193257ebeb71b68acd9d36af3331ca5fd4d9a46991", size = 34904126, upload-time = "2025-12-19T08:23:38.335Z" }, - { url = "https://files.pythonhosted.org/packages/09/26/641d7ee1a59e526aa94be980c485f899088d09dd1af517a2e1d0e85853bc/xgrammar-0.1.29-cp313-cp313-win_amd64.whl", hash = "sha256:12e6d63e892e9da8d088569dd629af58a5eafd909dc58788d499c4fd74bcd2a1", size = 5928450, upload-time = "2025-12-19T08:23:40.667Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/99/6a/d51b44fc0b43e2d4adae42b6a17fe9ee49e177d6d768be739ed7dec7b57e/xgrammar-0.1.32.tar.gz", hash = "sha256:5d424d52779ca2d3ccaf72f2289d6519efe308e933d0d3fc3c292c780825bb12", size = 2365047, upload-time = "2026-03-04T12:01:52.544Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/28/cd/4b5e67c8030b626a1a00b65b4d149b1b031c885eef86d4e5fa296f6ec72e/xgrammar-0.1.32-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:51b41c47785aa198d19f8d056b394f75b4421deab88c415568f9c588b1f7e238", size = 18425822, upload-time = "2026-03-04T12:00:23.356Z" }, + { url = "https://files.pythonhosted.org/packages/5c/c0/94fbc45642e733a9ad4a9f3f7300a1a06b265f8657af4d6a56acd8cf00c4/xgrammar-0.1.32-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d7030192cb1d8579699f1f72fd14d31347a402611aab98a2da6a04c3de07e917", size = 20582669, upload-time = "2026-03-04T12:00:26.463Z" }, + { url = "https://files.pythonhosted.org/packages/90/ea/2f4c8616d8ed0b5a3eb4e417b4987ad5a8d9dd9336ed966a8d48ffd45907/xgrammar-0.1.32-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a332c0364f665b410a6cfc2ada155c3a6ede430e385ac431015e31735a64fec3", size = 37682948, upload-time = "2026-03-04T12:00:29.814Z" }, + { url = "https://files.pythonhosted.org/packages/ac/ae/b9108fadd354ae776c1e7ecd26890a13ac8a30367f9fe8110443aedc4e6a/xgrammar-0.1.32-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5b8ad132d0fcf3a51dc054ecb0dc9808566b302122de6edaac7b4aca460adbec", size = 37709617, upload-time = "2026-03-04T12:00:33.068Z" }, + { url = "https://files.pythonhosted.org/packages/9d/48/0096bd1f3b460eac48faaecf79418ea3172269dccf37968e78dff5114faf/xgrammar-0.1.32-cp311-cp311-win_amd64.whl", hash = "sha256:b8b1ca6d3f3c2842660458660e494aaf0a6745f1b07ae74e4c2230ab4ff70c11", size = 6632722, upload-time = "2026-03-04T12:00:36.133Z" }, + { url = "https://files.pythonhosted.org/packages/9f/fd/5e771276fa090e35eaf1cbfdede24b9d93d6bbd2e99cd4f8d558f381fdee/xgrammar-0.1.32-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:9b78d32265f096e5567ab52c72b681855cf473481a48a1e7e6d97d414ba30b82", size = 18425090, upload-time = "2026-03-04T12:00:38.5Z" }, + { url = "https://files.pythonhosted.org/packages/31/66/f06745755ef0750f43955cf679b4bd8bd88ac8bfab760f020225c192884f/xgrammar-0.1.32-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23eacaf826c3aeebca0d91fc271417d9d96e157af2bacf6f14277297af7917ef", size = 20582048, upload-time = "2026-03-04T12:00:42.369Z" }, + { url = "https://files.pythonhosted.org/packages/79/29/3b0306800ccabce8f565123a5b97432dee43822c30142085d9b13b43f166/xgrammar-0.1.32-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9a637d4e0c541149e0d409c24f4ec79cd74d87508ee6a17a7e64a9b9c0cf56f", size = 37680849, upload-time = "2026-03-04T12:00:46.712Z" }, + { url = "https://files.pythonhosted.org/packages/69/62/65e664d861cdadf2d788c03dd8fe67f1faaa7bd4bd2317a2ab850aebee20/xgrammar-0.1.32-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f96c7a4fcbd68e18b13cb3b6ed5d24b5326b256933f476bdaf2cc8e609c228db", size = 37711100, upload-time = "2026-03-04T12:00:50.188Z" }, + { url = "https://files.pythonhosted.org/packages/80/43/05f27a1739209eb590772f867f3f48e6db0a36f376d85db4e68f49aee799/xgrammar-0.1.32-cp312-cp312-win_amd64.whl", hash = "sha256:ba6e08c385cce53eda8e9b3bbfba63f100ba3dcb76fa0692a65921a36b20ad0a", size = 6632259, upload-time = "2026-03-04T12:00:53.184Z" }, + { url = "https://files.pythonhosted.org/packages/7b/58/b4ff220b28d7d6a4ccf5c229ddbabc7018cd9544356ac8a161086e7a7a0e/xgrammar-0.1.32-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4addb8f5d5699e7df7fca6d299a91b3ef1ad799811c0ab7050d6f96d754c9c21", size = 20582005, upload-time = "2026-03-04T12:00:55.089Z" }, + { url = "https://files.pythonhosted.org/packages/83/95/9fedafd412af05b1d61859c52fd9d26abc9a167fab66bdad53f832da0956/xgrammar-0.1.32-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:028f8d6a105d06549faee0afbebfaada90aa1941c081dcc88f3d5ef373dad934", size = 37680882, upload-time = "2026-03-04T12:00:59.456Z" }, + { url = "https://files.pythonhosted.org/packages/0a/21/a9d328ae9ff4e794281995de3a1f8065517bb9bef70f099ab24f7743b3be/xgrammar-0.1.32-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0c0150c50eb3a56a35d6f0c0af0bce0f113ec5f84f7918bfd46b49e25ecf7fb5", size = 37710862, upload-time = "2026-03-04T12:01:02.739Z" }, + { url = "https://files.pythonhosted.org/packages/28/dc/8ecf71ad1e9c96fd941d2e9a852e184054596eeb1799de8b2e172eaf705e/xgrammar-0.1.32-cp313-cp313-win_amd64.whl", hash = "sha256:e1072d764705c8e87df6136ce3419f96ab3fd423d85f58c2d81c13a647b78894", size = 6632312, upload-time = "2026-03-04T12:01:05.474Z" }, + { url = "https://files.pythonhosted.org/packages/39/5d/79d524f302ab257f0b6856946e387783f688035360f0c8873b457700e391/xgrammar-0.1.32-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:4e6015ad2b941a292562f68b9a2ee1ddae8e28df840dc39232dcc7007fc6f606", size = 18432652, upload-time = "2026-03-04T12:01:07.366Z" }, + { url = "https://files.pythonhosted.org/packages/1f/4d/94bdf71b03f94b16265e956d9277fc182384561409b25ede79614fe1fa32/xgrammar-0.1.32-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8e8da3e7fc194e098b760bacb2b60ad2227cac70d7be5d2e4f7025b1c360c43d", size = 20582170, upload-time = "2026-03-04T12:01:10.012Z" }, + { url = "https://files.pythonhosted.org/packages/c8/80/30f9dcea0574c46a20cdecf91ab35f882fa4e7ba028ce5ebfeb3afe1d5bb/xgrammar-0.1.32-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6588cfd9754f2c46846276a2e8284a46582a74886d7aaea02cf6ce63ccc397ce", size = 37680819, upload-time = "2026-03-04T12:01:12.958Z" }, + { url = "https://files.pythonhosted.org/packages/dc/bc/4ff87fbf59a4abd272325d3489ac5aa599bacd8b01ea09fec2ca84eece14/xgrammar-0.1.32-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7f740ba83b69abb423167a5d5b13a9fcde89747220e191f6a004fae4a834311f", size = 37711054, upload-time = "2026-03-04T12:01:17.469Z" }, + { url = "https://files.pythonhosted.org/packages/62/fa/16b91df8a50798980b60b2c4c800280a3bed50d6a18e55ef6958d30d0faa/xgrammar-0.1.32-cp314-cp314-win_amd64.whl", hash = "sha256:9c0769c3468bd67495c28a03dc5ce3948d83cddaf0a59c6d992b12fc683a1c3e", size = 6718108, upload-time = "2026-03-04T12:01:20.222Z" }, + { url = "https://files.pythonhosted.org/packages/48/7d/78373114c3ceb5e82cb98bbbde20191477ff5b219f941aa7a535c94bcab8/xgrammar-0.1.32-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:da8339b38e96d105868c14b2cb2df4b7c83d7a49f8539c74fd7470d61043e5b1", size = 18435039, upload-time = "2026-03-04T12:01:22.458Z" }, + { url = "https://files.pythonhosted.org/packages/61/64/676553d63f74b65887e3ebad86468f557fe0a0ff6373186d300272c7776c/xgrammar-0.1.32-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b938a9096bccc06c30abb5304b2b39c272a924ca002e19421cce5e6ee9670f4f", size = 20584105, upload-time = "2026-03-04T12:01:26.08Z" }, + { url = "https://files.pythonhosted.org/packages/67/dd/fa6ce458f7b9ab694458683064de08c07509d17c148241000b3d97291383/xgrammar-0.1.32-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fe2ee94080d77b84e38cb6643b75a6ca29cf814a3e5d5da8e1176eae4034d662", size = 37683911, upload-time = "2026-03-04T12:01:29.661Z" }, + { url = "https://files.pythonhosted.org/packages/80/ba/98675e76c481832a6cbe51aba2b1bf4a9593b5352f9a60c07c5d209e184a/xgrammar-0.1.32-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:70ddbf7216e1e7ec96134a2474a6b84d2b14439a6f6379e079b7c557131be41d", size = 37706596, upload-time = "2026-03-04T12:01:33.264Z" }, + { url = "https://files.pythonhosted.org/packages/5d/b8/aeafad38d44af75e31101752bcd8fa2a9f4f6b702861813bc7edcfbca266/xgrammar-0.1.32-cp314-cp314t-win_amd64.whl", hash = "sha256:4f68e591a6e9e121d5f03821ab2c44a7af092dc8bf7c9cde1a776871c6bd4dc5", size = 6723286, upload-time = "2026-03-04T12:01:35.866Z" }, ] [[package]] @@ -4774,3 +4877,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/b7/503c98092fb3b344a179579f55814b613c1fbb1c23b3ec14a7b008a66a6e/yarl-1.22.0-cp314-cp314t-win_arm64.whl", hash = "sha256:9f6d73c1436b934e3f01df1e1b21ff765cd1d28c77dfb9ace207f746d4610ee1", size = 85171, upload-time = "2025-10-06T14:12:16.935Z" }, { url = "https://files.pythonhosted.org/packages/73/ae/b48f95715333080afb75a4504487cbe142cae1268afc482d06692d605ae6/yarl-1.22.0-py3-none-any.whl", hash = "sha256:1380560bdba02b6b6c90de54133c81c9f2a453dee9912fe58c1dcced1edb7cff", size = 46814, upload-time = "2025-10-06T14:12:53.872Z" }, ] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +] diff --git a/vllm_spyre/platform.py b/vllm_spyre/platform.py index fa6e1ae67..46b0e22ec 100644 --- a/vllm_spyre/platform.py +++ b/vllm_spyre/platform.py @@ -17,12 +17,8 @@ import torch from vllm.logger import init_logger +from vllm.utils.argparse_utils import FlexibleArgumentParser -try: - # pre 0.11.1 compatibility - from vllm.utils import FlexibleArgumentParser # ty: ignore[unresolved-import] -except ImportError: - from vllm.utils.argparse_utils import FlexibleArgumentParser if TYPE_CHECKING: # NB: We can't eagerly import many things from vllm since vllm.config @@ -30,24 +26,15 @@ from vllm.config import ModelConfig, VllmConfig from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams - from vllm.inputs import ProcessorInputs, PromptType, TokenInputs - - # Try to import new types (0.16.0+) - try: - from vllm.renderers.inputs import DictPrompt, TokPrompt - except ImportError: - DictPrompt = None # type: ignore - TokPrompt = None # type: ignore + from vllm.inputs import ProcessorInputs, TokenInputs + else: ModelConfig = None VllmConfig = None SamplingParams = None PoolingParams = None ProcessorInputs = None - PromptType = None TokenInputs = None - DictPrompt = None - TokPrompt = None from vllm.platforms import Platform, PlatformEnum import vllm_spyre.envs as envs_spyre @@ -103,7 +90,8 @@ def get_device_name(cls, device_id: int = 0) -> str: @classmethod def import_kernels(cls) -> None: - pass # suppress warning + # Workaround torch.accelerator.empty_cache for torch 2.7.1 and vllm v0.18.0 compatibility + setattr(torch.accelerator, "empty_cache", lambda: None) # noqa @classmethod def is_async_output_supported(cls, enforce_eager: bool | None) -> bool: @@ -319,6 +307,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "set `--max-num-batched-tokens` to a number that satisfies " "this constraint." ) + if hasattr(cache_config, "user_specified_block_size"): + cache_config.user_specified_block_size = True logger.info( "Configurations for Spyre. max_model_len=%d, max_num_seqs=%d, block_size=%d, " @@ -436,9 +426,8 @@ def supports_v1(cls, model_config: ModelConfig) -> bool: @classmethod def validate_request( cls, - prompt: "PromptType | DictPrompt | TokPrompt", + processed_inputs: "ProcessorInputs", params: "SamplingParams | PoolingParams", - processed_inputs: "ProcessorInputs | None" = None, ) -> None: """Raises if this request is unsupported on this platform""" @@ -464,18 +453,12 @@ def validate_request( ) params.structured_outputs = None - if isinstance(prompt, dict) and "prompt_token_ids" in prompt: - prompt_len = len(prompt["prompt_token_ids"]) # ty: ignore - elif processed_inputs is not None: - if "encoder" in processed_inputs: - raise ValueError("Encoder-decoder models not supported ") - if "prompt_token_ids" not in processed_inputs: - # Can't do any extra validation on embedding-only inputs - return - prompt_len = len(cast(TokenInputs, processed_inputs)["prompt_token_ids"]) - else: - # We need a prompt length to do any validation here + if "encoder_prompt" in processed_inputs: + raise ValueError("Encoder-decoder models not supported ") + if "prompt_token_ids" not in processed_inputs: + # Can't do any extra validation on embedding-only inputs return + prompt_len = len(cast(TokenInputs, processed_inputs)["prompt_token_ids"]) max_tokens = 0 if params is not None and params.max_tokens is not None: diff --git a/vllm_spyre/v1/core/scheduler.py b/vllm_spyre/v1/core/scheduler.py index 164ee227e..5077dc755 100644 --- a/vllm_spyre/v1/core/scheduler.py +++ b/vllm_spyre/v1/core/scheduler.py @@ -788,21 +788,27 @@ def check_batch_tkv_limit_cp( def finish_requests( self, - request_ids: Union[str, Iterable[str]], - finished_status, - ) -> None: + request_ids: Union[str, Iterable[str], None], + finished_status: RequestStatus, + ) -> list[tuple[str, int]]: """Handles removing finished requests from ongoing_prefills""" if isinstance(request_ids, str): request_ids = (request_ids,) - # first defer to vLLM scheduler where validation is handled - super(SpyreScheduler, self).finish_requests( + # first defer to vLLM scheduler + # validates the input requests and generates the output + aborted_requests = super(SpyreScheduler, self).finish_requests( request_ids=request_ids, finished_status=finished_status ) - self.ongoing_prefills = [ - r for r in self.ongoing_prefills if r.request_id not in request_ids - ] + # request_ids None means all requests are finished + self.ongoing_prefills = ( + [] + if request_ids is None + else [r for r in self.ongoing_prefills if r.request_id not in request_ids] + ) + + return aborted_requests def make_stats(self, *args, **kwargs) -> SchedulerStats | None: """Update the scheduler stats from the base scheduler. diff --git a/vllm_spyre/v1/metrics/stats_logger.py b/vllm_spyre/v1/metrics/stats_logger.py index 777b86dd0..8898d4a97 100644 --- a/vllm_spyre/v1/metrics/stats_logger.py +++ b/vllm_spyre/v1/metrics/stats_logger.py @@ -9,17 +9,15 @@ from vllm.logger import init_logger from vllm.v1.engine import async_llm, llm_engine from vllm.v1.metrics.loggers import StatLoggerBase, StatLoggerManager -from vllm.v1.metrics.stats import FinishedRequestStats, IterationStats, SchedulerStats +from vllm.v1.metrics.stats import ( + FinishedRequestStats, + IterationStats, + MultiModalCacheStats, + SchedulerStats, +) from vllm_spyre import envs as envs_spyre -try: - from vllm.v1.metrics.stats import MultiModalCacheStats -except ImportError: - # compatibility for vllm pre 0.11.1 - class MultiModalCacheStats: - pass - logger = init_logger(__name__) diff --git a/vllm_spyre/v1/worker/spyre_input_batch.py b/vllm_spyre/v1/worker/spyre_input_batch.py index b36764d48..528b10a7c 100644 --- a/vllm_spyre/v1/worker/spyre_input_batch.py +++ b/vllm_spyre/v1/worker/spyre_input_batch.py @@ -5,7 +5,7 @@ from abc import abstractmethod from dataclasses import dataclass, field -from typing import Any, Generic, TypeVar, cast +from typing import Generic, TypeVar, cast import numpy as np import torch @@ -20,7 +20,6 @@ from vllm.v1.sample.metadata import SamplingMetadata from vllm_spyre.v1.sample.spyre_logits_processor import LogitProcessorWrapper -from vllm_spyre.compat_utils import has_argument @dataclass @@ -742,12 +741,9 @@ def make_pooling_metadata(self) -> PoolingMetadata: assert len(self.requests_ids) == len(self.pooling_params) pooling_params = [self.pooling_params[req_id] for req_id in self.requests_ids] - kwargs: dict[str, Any] = {} - if has_argument(PoolingMetadata, "pooling_states"): - kwargs["pooling_states"] = [] return PoolingMetadata( prompt_lens=torch.from_numpy(self._get_num_prompt_tokens()).to(self.device), prompt_token_ids=prompt_token_ids, pooling_params=pooling_params, - **kwargs, + pooling_states=[], ) diff --git a/vllm_spyre/v1/worker/spyre_model_runner.py b/vllm_spyre/v1/worker/spyre_model_runner.py index 728301295..8f41a447e 100644 --- a/vllm_spyre/v1/worker/spyre_model_runner.py +++ b/vllm_spyre/v1/worker/spyre_model_runner.py @@ -15,12 +15,8 @@ from vllm.logger import init_logger from vllm.sampling_params import SamplingType -try: - # pre 0.11.1 compatibility - from vllm.utils import get_hash_fn_by_name, is_pin_memory_available # ty: ignore[unresolved-import] -except ImportError: - from vllm.utils.platform_utils import is_pin_memory_available - from vllm.utils.hashing import get_hash_fn_by_name +from vllm.utils.platform_utils import is_pin_memory_available +from vllm.utils.hashing import get_hash_fn_by_name from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import KVCacheBlock, get_request_block_hasher, init_none_hash @@ -35,7 +31,6 @@ import vllm_spyre.envs as envs_spyre import vllm_spyre.utils as utils_spyre -from vllm_spyre.compat_utils import has_argument from vllm_spyre.model_executor.model_loader.spyre import ( BACKEND_LIST, SpyreAttentionMetadata, @@ -955,14 +950,11 @@ def _set_blocks(self, num_blocks: int) -> None: self.kv_cache_manager = self._make_kv_cache_manager() def _make_block_pool(self) -> BlockPool: - kwargs = {} - if has_argument(BlockPool, "hash_block_size"): - kwargs["hash_block_size"] = self.block_size return BlockPool( num_gpu_blocks=self.n_blocks + 1, enable_caching=self.enable_prefix_caching, enable_kv_cache_events=False, - **kwargs, + hash_block_size=self.block_size, ) def _make_kv_cache_manager(self) -> FullAttentionManager: @@ -974,33 +966,24 @@ def _make_kv_cache_manager(self) -> FullAttentionManager: dtype=torch.float16, ) - # Enable_caching parameter added in vllm v0.14.0 - kwargs = { - "kv_cache_spec": self._attn_spec, - "block_pool": self.block_pool, + return FullAttentionManager( + kv_cache_spec=self._attn_spec, + block_pool=self.block_pool, # Currently don't support models with more than one # attention type, e.g. full and sliding window, so # there is only one group. - "kv_cache_group_id": 0, + kv_cache_group_id=0, # We don't support DCP # https://docs.vllm.ai/en/latest/serving/context_parallel_deployment/#decode-context-parallel - "dcp_world_size": 1, - } - - # Conditionally add param for vLLM >= 0.14.0 - if has_argument(FullAttentionManager.__init__, "enable_caching"): - kwargs["enable_caching"] = self.enable_prefix_caching - - return FullAttentionManager(**kwargs) # ty: ignore[invalid-argument-type] + dcp_world_size=1, + enable_caching=self.enable_prefix_caching, + ) # ty: ignore[invalid-argument-type] def _allocate_new_blocks_wrapper(self, req_id: str, num_tokens: int): - """Backwards compatibility for change to interface in v0.15.0""" - kwargs: dict[str, Any] = { - "num_tokens": num_tokens, - } - if has_argument(self.kv_cache_manager.allocate_new_blocks, "num_tokens_main_model"): - kwargs["num_tokens_main_model"] = num_tokens - return self.kv_cache_manager.allocate_new_blocks(req_id, **kwargs) + """Wrapper for allocating new blocks""" + return self.kv_cache_manager.allocate_new_blocks( + req_id, num_tokens=num_tokens, num_tokens_main_model=num_tokens + ) def _get_blocks(self, request_id: str) -> list[KVCacheBlock]: return self.kv_cache_manager.req_to_blocks[request_id] @@ -1941,17 +1924,11 @@ def execute_model( pooling_metadata = self.input_batch.make_pooling_metadata() ## No partial prefill, hence we can use the prompt lens here - cursor_kwargs: dict[str, Any] = {} - if has_argument(pooling_metadata.build_pooling_cursor, "seq_lens_cpu"): - cursor_kwargs["seq_lens_cpu"] = pooling_metadata.prompt_lens - - # v0.14.0 uses param "num_scheduled_tokens_np" - if has_argument(pooling_metadata.build_pooling_cursor, "num_scheduled_tokens_np"): - cursor_kwargs["num_scheduled_tokens_np"] = pooling_metadata.prompt_lens.numpy() - else: - cursor_kwargs["num_scheduled_tokens"] = pooling_metadata.prompt_lens.tolist() - - pooling_metadata.build_pooling_cursor(device=self.device, **cursor_kwargs) + pooling_metadata.build_pooling_cursor( + device=self.device, + seq_lens_cpu=pooling_metadata.prompt_lens, + num_scheduled_tokens_np=pooling_metadata.prompt_lens.numpy(), + ) # prepare unpadded output for the pooler hidden_state_list: list[torch.Tensor] = [] @@ -2002,11 +1979,7 @@ def __init__( # For hybrid KV caches, the `alignment_tokens` arg needs to be set to # the lowest common multiple of kv cache block sizes. Currently we only # support homogeneous kv caches with a single block size though. - self._alignment_token_kwargs = ( - {"alignment_tokens": self.block_size} - if has_argument(FullAttentionManager.find_longest_cache_hit, "alignment_tokens") - else {} - ) + self._alignment_token_kwargs = {"alignment_tokens": self.block_size} if vllm_config.cache_config.enable_prefix_caching: caching_hash_fn = get_hash_fn_by_name(vllm_config.cache_config.prefix_caching_hash_algo) @@ -2384,20 +2357,12 @@ def _plan_chunking(self, scheduler_request: Request) -> ChunkedPrefillPlan: # blocks in the last chunk to deduplicate the used blocks. So # although we will recompute, we'll still point the block table # to the cached blocks. - try: - # vllm >= v0.14.0 - self.kv_cache_manager.allocate_new_computed_blocks( - request_id=scheduler_request.request_id, - new_computed_blocks=computed_blocks, - num_local_computed_tokens=len(computed_blocks) * self.block_size, - num_external_computed_tokens=0, - ) - except (AttributeError, TypeError): - # vllm < v0.14.0 - self.kv_cache_manager.save_new_computed_blocks( - scheduler_request.request_id, - computed_blocks, - ) + self.kv_cache_manager.allocate_new_computed_blocks( + request_id=scheduler_request.request_id, + new_computed_blocks=computed_blocks, + num_local_computed_tokens=len(computed_blocks) * self.block_size, + num_external_computed_tokens=0, + ) else: usable_blocks = 0 n_hit = 0 @@ -2432,7 +2397,6 @@ def add_new_request(self, request: NewRequestData): prompt_token_ids=prompt_token_ids, sampling_params=request.sampling_params, pooling_params=None, - eos_token_id=None, block_hasher=self.request_block_hasher, mm_features=mm_features, ) diff --git a/vllm_spyre/v1/worker/spyre_worker.py b/vllm_spyre/v1/worker/spyre_worker.py index fb9ecf567..a80eba75f 100644 --- a/vllm_spyre/v1/worker/spyre_worker.py +++ b/vllm_spyre/v1/worker/spyre_worker.py @@ -21,12 +21,7 @@ from vllm.distributed import ensure_model_parallel_initialized, init_distributed_environment from vllm.logger import init_logger -try: - # vllm >= v0.14.0 - from vllm.utils.torch_utils import set_random_seed -except ImportError: - # vllm < v0.14.0 - from vllm.model_executor import set_random_seed # ty: ignore[unresolved-import] +from vllm.utils.torch_utils import set_random_seed from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams @@ -154,12 +149,15 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ return self.model_runner.get_kv_cache_spec() - def compile_or_warm_up_model(self) -> None: - """Prepare model for execution through compilation/warmup.""" + def compile_or_warm_up_model(self) -> float: + """Prepare model for execution through compilation/warmup. + + Returns: + The accumulated compilation time in seconds. + """ if envs_spyre.VLLM_SPYRE_USE_CB: - self._warmup_spyre_dynamic_size(self.restricted_tokens) - return + return self._warmup_spyre_dynamic_size(self.restricted_tokens) if self.model_runner.is_multimodal: raise NotImplementedError( "[WARMUP] Static batching is not supported for multimodal models." @@ -207,6 +205,7 @@ def compile_or_warm_up_model(self) -> None: num_shape_combinations, all_warmup_total_t, ) + return all_warmup_total_t def check_health(self) -> None: """Basic health check (override for device-specific checks).""" @@ -262,24 +261,13 @@ def __init__( distributed_init_method: str, is_driver_worker: bool = False, ) -> None: - try: - # pre 0.11.1 compatibility with old worker base class - from vllm.worker.worker_base import WorkerBase as LegacyWorkerBase # ty: ignore - - LegacyWorkerBase.__init__(self, vllm_config=vllm_config) - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - except ImportError: - # From 0.11.1 and on we should only have to call the super init - super().__init__( - vllm_config=vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - is_driver_worker=is_driver_worker, - ) + super().__init__( + vllm_config=vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=is_driver_worker, + ) # For power-user debugging of spyre logs for tensor parallel ops self.redirect_logs_to_files() @@ -289,22 +277,6 @@ def __init__( assert rank % self.parallel_config.tensor_parallel_size == 0, ( "Driver worker should be rank 0 of tensor parallel group." ) - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - try: - # pre 0.11.1 compatibility - from vllm.utils import init_cached_hf_modules # ty: ignore[unresolved-import] - - init_cached_hf_modules() - except ImportError: - # 0.11.1 to 0.13.0 compatibility - try: - from vllm.utils.import_utils import init_cached_hf_modules # ty: ignore[unresolved-import] - - init_cached_hf_modules() - except ImportError: - # >=0.14.0, init_cached_hf_modules is no longer needed - pass self.model_runner: Union[ StaticBatchingSpyreModelRunner, @@ -576,7 +548,7 @@ def load_model(self): self.perf_metrics.log("load model time", load_model_total_t, model=self.model_config.model) logger.info("load model took %.3fs", load_model_total_t) - def _warmup_spyre_dynamic_size(self, special_token_ids): + def _warmup_spyre_dynamic_size(self, special_token_ids) -> float: warmup_start_t = time.time() # satisfy mypy @@ -670,9 +642,6 @@ def _warmup_spyre_dynamic_size(self, special_token_ids): scheduled_cached_reqs=CachedRequestData.make_empty(), num_scheduled_tokens={deploy_req.req_id: prompt_len}, total_num_scheduled_tokens=prompt_len, - scheduled_spec_decode_tokens={}, - scheduled_encoder_inputs={}, - num_common_prefix_blocks=[], finished_req_ids=set(), **_get_extra_args(), ) @@ -692,6 +661,7 @@ def _warmup_spyre_dynamic_size(self, special_token_ids): ) maybe_override_signals_handler() + return warmup_total_t def _cleanup_model_runner(self, request) -> None: # Needed to clean up the data of model runner @@ -701,9 +671,6 @@ def _cleanup_model_runner(self, request) -> None: num_scheduled_tokens={}, # NOTE: this means no work to do total_num_scheduled_tokens=0, - scheduled_spec_decode_tokens={}, - scheduled_encoder_inputs={}, - num_common_prefix_blocks=[], # The requests to be removed finished_req_ids=set([r.req_id for r in request]), **_get_extra_args(), @@ -783,9 +750,6 @@ def _warmup_spyre_fixed_size( scheduled_cached_reqs=cached_request_data, num_scheduled_tokens={r.req_id: self._get_num_tokens(r) for r in dummy_requests}, total_num_scheduled_tokens=sum(prompt_len for _ in range(batch_size)), - scheduled_spec_decode_tokens={}, - scheduled_encoder_inputs={}, - num_common_prefix_blocks=[], finished_req_ids=set(), **_get_extra_args(), ) @@ -855,9 +819,6 @@ def _dynamic_warmup( scheduled_cached_reqs=CachedRequestData.make_empty(), num_scheduled_tokens={req.req_id: prompt_len}, total_num_scheduled_tokens=prompt_len, - scheduled_spec_decode_tokens={}, - scheduled_encoder_inputs={}, - num_common_prefix_blocks=[], finished_req_ids=set(), **_get_extra_args(), ) @@ -886,9 +847,6 @@ def _dynamic_warmup( scheduled_cached_reqs=cached_request_data, num_scheduled_tokens={req.req_id: 1 for req in requests}, total_num_scheduled_tokens=1, - scheduled_spec_decode_tokens={}, - scheduled_encoder_inputs={}, - num_common_prefix_blocks=[], finished_req_ids=set(), **_get_extra_args(), ) @@ -996,14 +954,10 @@ def signal_handler(signum, frame): def _get_extra_args() -> dict: - """Add any required backwards compatibility code for constructing - SchedulerOutputs here""" - extra_args: dict = {} - extra_args.update({"free_encoder_mm_hashes": []}) - - if "structured_output_request_ids" in dataclass_fields(SchedulerOutput): - extra_args["structured_output_request_ids"] = {} - if "grammar_bitmask" in dataclass_fields(SchedulerOutput): - extra_args["grammar_bitmask"] = None - - return extra_args + """Add any required extra args for constructing SchedulerOutputs""" + return { + "free_encoder_mm_hashes": [], + "scheduled_spec_decode_tokens": {}, + "scheduled_encoder_inputs": {}, + "num_common_prefix_blocks": [], + }