From d919aa81e108fcdd15a93d340a1af0e12eae9147 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 08:23:45 +0000 Subject: [PATCH 001/130] [Core] Enable HF processing on GPU Signed-off-by: DarkLight1337 --- docs/configuration/conserving_memory.md | 30 ++--- docs/configuration/optimization.md | 70 +++------- tests/entrypoints/llm/test_chat.py | 56 +++++++- tests/entrypoints/openai/test_audio.py | 97 ++++---------- tests/entrypoints/openai/test_vision.py | 153 ++++++--------------- vllm/config.py | 10 ++ vllm/engine/arg_utils.py | 30 +++-- vllm/inputs/registry.py | 17 ++- vllm/v1/worker/gpu_model_runner.py | 169 +++++++++++++----------- vllm/v1/worker/tpu_model_runner.py | 13 +- 10 files changed, 287 insertions(+), 358 deletions(-) diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 4d5c961af98f..75d19e4420f4 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -129,20 +129,18 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory. Here are some examples: -??? 
code - - ```python - from vllm import LLM +```python +from vllm import LLM - # Available for Qwen2-VL series models - llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_kwargs={ - "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 - }) - - # Available for InternVL series models - llm = LLM(model="OpenGVLab/InternVL2-2B", - mm_processor_kwargs={ - "max_dynamic_patch": 4, # Default is 12 - }) - ``` +# Available for Qwen2-VL series models +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={ + "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 + }) + +# Available for InternVL series models +llm = LLM(model="OpenGVLab/InternVL2-2B", + mm_processor_kwargs={ + "max_dynamic_patch": 4, # Default is 12 + }) +``` diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 811925c19e63..9576125b86f7 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -2,6 +2,9 @@ This guide covers optimization strategies and performance tuning for vLLM V1. +!!! tip + Running out of memory? Consult [this guide](./conserving_memory.md) on how to conserve memory. + ## Preemption Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests. @@ -126,62 +129,25 @@ Data parallelism replicates the entire model across multiple GPU sets and proces Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`. Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size. 
-## Reducing Memory Usage - -If you encounter out-of-memory issues, consider these strategies: - -### Context Length and Batch Size - -You can reduce memory usage by limiting the context length and batch size: - -```python -from vllm import LLM - -llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - max_model_len=2048, # Limit context window - max_num_seqs=4 # Limit batch size -) -``` - -### Adjust CUDA Graph Compilation - -CUDA graph compilation in V1 uses more memory than in V0. You can reduce memory usage by adjusting the compilation level: - -```python -from vllm import LLM -from vllm.config import CompilationConfig, CompilationLevel - -llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, - cudagraph_capture_sizes=[1, 2, 4, 8] # Capture fewer batch sizes - ) -) -``` +### Multi-modal processing using GPU -Or, if you are not concerned about latency or overall performance, disable CUDA graph compilation entirely with `enforce_eager=True`: +You can speed up input processing by running Hugging Face processors on the GPU. +To support this, the processor must accept a `device` argument in its call signature. 
+As of this writing, the following processors are known to support GPU acceleration: -```python -from vllm import LLM +- Descendants of `BaseImageProcessorFast` (requires `use_fast=True`) +- Descendants of `BaseVideoProcessor` +- `WhisperFeatureExtractor` -llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True # Disable CUDA graph compilation -) -``` - -### Multimodal Models - -For multi-modal models, you can reduce memory usage by limiting the number of images/videos per request: +To run Hugging Face processors on the GPU, you can pass the `device` argument +(and `use_fast` if needed) via `mm_processor_kwargs`: ```python -from vllm import LLM +# Fast image processor requires use_fast=True +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={"use_fast": True, "device": "cuda"}) -# Accept up to 2 images per prompt -llm = LLM( - model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"image": 2} -) +# Whisper feature extractor does not require use_fast +llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", + mm_processor_kwargs={"device": "cuda"}) ``` diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 97cf3b5ce8fc..d4272dfca0fd 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -6,8 +6,12 @@ from vllm import LLM from vllm.distributed import cleanup_dist_env_and_memory +from vllm.platforms import current_platform -from ..openai.test_vision import TEST_IMAGE_URLS +from ..openai.test_audio import (TEST_AUDIO_URLS, + get_dummy_messages_from_audio_url) +from ..openai.test_vision import (TEST_IMAGE_URLS, + get_dummy_messages_from_image_url) @pytest.fixture(scope="function") @@ -195,3 +199,53 @@ def test_chat_extra_kwargs(thinking_llm, enable_thinking): else: # The chat template includes dummy thinking process assert think_id in prompt_token_ids + + +def _get_messages(modality: str): + if modality == "image": + return [ + { + "role": "system", + "content": 
"You are a helpful assistant" + }, + { + "role": "user", + "content": [{ + "type": "text", + "text": "What is in this image?" + }] + }, + ] + + +@pytest.mark.parametrize(("model_id", "modality", "mm_init_kwargs"), [ + ("Qwen/Qwen2.5-VL-3B-Instruct", "image", { + "use_fast": True + }), + ("Qwen/Qwen2-Audio-7B-Instruct", "audio", {}), +]) +def test_mm_processing_gpu(model_id, modality, mm_init_kwargs): + if modality == "image": + messages = get_dummy_messages_from_image_url(TEST_IMAGE_URLS[0]) + elif modality == "audio": + messages = get_dummy_messages_from_audio_url(TEST_AUDIO_URLS[0]) + else: + raise NotImplementedError(modality) + + device = current_platform.device_name + llm = LLM( + model=model_id, + max_model_len=6144, + max_num_seqs=2, + enforce_eager=True, + seed=0, + mm_processor_kwargs=mm_init_kwargs | {"device": device}, + ) + + outputs = llm.chat(messages) + assert len(outputs) == 1 + + if device != "cpu": + match = "cannot override the device for multi-modal preprocessing" + with pytest.raises(ValueError, match=match): + llm.chat(messages, mm_processor_kwargs={"device": "cpu"}) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index d67c05ab3e8d..f861013b5dfe 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json +from typing import Union import openai import pytest @@ -51,28 +52,38 @@ def base64_encoded_audio() -> dict[str, str]: } -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) -async def test_single_chat_session_audio(client: openai.AsyncOpenAI, - model_name: str, audio_url: str): - messages = [{ +def get_dummy_messages_from_audio_url( + audio_urls: Union[str, list[str]], + content_text: str = "What's happening in this audio?", +): + if isinstance(audio_urls, str): + audio_urls = 
[audio_urls] + + return [{ "role": "user", "content": [ - { + *({ "type": "audio_url", "audio_url": { "url": audio_url } - }, + } for audio_url in audio_urls), { "type": "text", - "text": "What's happening in this audio?" + "text": content_text }, ], }] + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) +async def test_single_chat_session_audio(client: openai.AsyncOpenAI, + model_name: str, audio_url: str): + messages = get_dummy_messages_from_audio_url(audio_url) + # test single completion chat_completion = await client.chat.completions.create( model=model_name, @@ -111,20 +122,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI, model_name: str, audio_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "audio_url", - "audio_url": audio_url - }, - { - "type": "text", - "text": "What's happening in this audio?" - }, - ], - }] + messages = get_dummy_messages_from_audio_url(audio_url) # audio_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): @@ -141,23 +139,8 @@ async def test_single_chat_session_audio_base64encoded( client: openai.AsyncOpenAI, model_name: str, audio_url: str, base64_encoded_audio: dict[str, str]): - messages = [{ - "role": - "user", - "content": [ - { - "type": "audio_url", - "audio_url": { - "url": - f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" - } - }, - { - "type": "text", - "text": "What's happening in this audio?" 
- }, - ], - }] + messages = get_dummy_messages_from_audio_url( + f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}") # test single completion chat_completion = await client.chat.completions.create( @@ -252,22 +235,7 @@ async def test_single_chat_session_input_audio( @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) async def test_chat_streaming_audio(client: openai.AsyncOpenAI, model_name: str, audio_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "audio_url", - "audio_url": { - "url": audio_url - } - }, - { - "type": "text", - "text": "What's happening in this audio?" - }, - ], - }] + messages = get_dummy_messages_from_audio_url(audio_url) # test single completion chat_completion = await client.chat.completions.create( @@ -371,22 +339,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str]): - messages = [{ - "role": - "user", - "content": [ - *({ - "type": "audio_url", - "audio_url": { - "url": audio_url - } - } for audio_url in audio_urls), - { - "type": "text", - "text": "What's happening in this audio?" 
- }, - ], - }] + messages = get_dummy_messages_from_audio_url(audio_urls) if len(audio_urls) > MAXIMUM_AUDIOS: with pytest.raises(openai.BadRequestError): # test multi-audio input diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 8259a81d7b6a..88732edd6a85 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json +from typing import Union import openai import pytest @@ -78,6 +79,31 @@ def base64_encoded_image() -> dict[str, str]: } +def get_dummy_messages_from_image_url( + image_urls: Union[str, list[str]], + content_text: str = "What's in this image?", +): + if isinstance(image_urls, str): + image_urls = [image_urls] + + return [{ + "role": + "user", + "content": [ + *({ + "type": "image_url", + "image_url": { + "url": image_url + } + } for image_url in image_urls), + { + "type": "text", + "text": content_text + }, + ], + }] + + def get_hf_prompt_tokens(model_name, content, image_url): processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, @@ -103,22 +129,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): async def test_single_chat_session_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): content_text = "What's in this image?" - messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": content_text - }, - ], - }] + messages = get_dummy_messages_from_image_url(image_url, content_text) max_completion_tokens = 10 # test single completion @@ -164,20 +175,7 @@ async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, model_name: str, image_url: str): content_text = "What's in this image?" 
- messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": image_url - }, - { - "type": "text", - "text": content_text - }, - ], - }] + messages = get_dummy_messages_from_image_url(image_url, content_text) # image_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): @@ -193,22 +191,8 @@ async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, model_name: str, image_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": "What's in this image?" - }, - ], - }] + content_text = "What's in this image?" + messages = get_dummy_messages_from_image_url(image_url, content_text) chat_completion = await client.chat.completions.create( model=model_name, @@ -229,25 +213,11 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, async def test_single_chat_session_image_base64encoded( client: openai.AsyncOpenAI, model_name: str, image_url: str, base64_encoded_image: dict[str, str]): - content_text = "What's in this image?" 
- messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": - f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" - } - }, - { - "type": "text", - "text": content_text - }, - ], - }] + messages = get_dummy_messages_from_image_url( + f"data:image/jpeg;base64,{base64_encoded_image[image_url]}", + content_text, + ) max_completion_tokens = 10 # test single completion @@ -297,23 +267,9 @@ async def test_single_chat_session_image_base64encoded_beamsearch( image_url = TEST_IMAGE_URLS[image_idx] expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] - messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": - f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" - } - }, - { - "type": "text", - "text": "What's in this image?" - }, - ], - }] + messages = get_dummy_messages_from_image_url( + f"data:image/jpeg;base64,{base64_encoded_image[image_url]}") + chat_completion = await client.chat.completions.create( model=model_name, messages=messages, @@ -331,22 +287,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch( @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_chat_streaming_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": "What's in this image?" 
- }, - ], - }] + messages = get_dummy_messages_from_image_url(image_url) # test single completion chat_completion = await client.chat.completions.create( @@ -390,23 +331,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))]) async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]): - - messages = [{ - "role": - "user", - "content": [ - *({ - "type": "image_url", - "image_url": { - "url": image_url - } - } for image_url in image_urls), - { - "type": "text", - "text": "What's in this image?" - }, - ], - }] + messages = get_dummy_messages_from_image_url(image_urls) if len(image_urls) > MAXIMUM_IMAGES: with pytest.raises(openai.BadRequestError): # test multi-image input diff --git a/vllm/config.py b/vllm/config.py index 93daab7d6ae9..4be10328a4b6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3348,6 +3348,16 @@ def merge_mm_processor_kwargs( according to the extra arguments passed during inference. """ kwargs = self.mm_processor_kwargs or {} + + # This is to avoid breaking assumptions in memory profiling + if (init_device := kwargs.get("device", + "cpu")) != (inference_device := + inference_kwargs.get( + "device", init_device)): + raise ValueError( + "You cannot override the device for multi-modal preprocessing " + f"at runtime! Found: {init_device=} vs. 
{inference_device=}") + return kwargs | dict(inference_kwargs) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 78272d983eaf..a7d6078b9ea9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1197,17 +1197,25 @@ def create_engine_config( enable_multimodal_encoder_data_parallel, ) - supports_mm_preprocessor_cache = (self.data_parallel_size == 1 - or data_parallel_external_lb) - if (not supports_mm_preprocessor_cache - and model_config.is_multimodal_model - and not model_config.disable_mm_preprocessor_cache): - logger.warning( - "Multi-modal preprocessor cache is not compatible " - "with data parallelism when there does not exist a " - "one-to-one correspondance between API process and " - "EngineCore process, so the cache will be disabled.") - model_config.set_disable_mm_preprocessor_cache(True) + if model_config.is_multimodal_model: + mm_processor_kwargs = model_config.mm_processor_kwargs or {} + if (mm_processor_kwargs.get("device", "cpu") != "cpu" + and not model_config.disable_mm_preprocessor_cache): + logger.info("Multi-modal preprocessor cache is automatically " + "disabled to optimize the performance of " + "GPU-accelerated multi-modal processor.") + model_config.set_disable_mm_preprocessor_cache(True) + + supports_mm_preprocessor_cache = (self.data_parallel_size == 1 + or data_parallel_external_lb) + if (not supports_mm_preprocessor_cache + and not model_config.disable_mm_preprocessor_cache): + logger.warning( + "Multi-modal preprocessor cache is not compatible " + "with data parallelism when there does not exist a " + "one-to-one correspondance between API process and " + "EngineCore process, so the cache will be disabled.") + model_config.set_disable_mm_preprocessor_cache(True) speculative_config = self.create_speculative_config( target_model_config=model_config, diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 6331a70b469a..ce47bc928c4b 100644 --- a/vllm/inputs/registry.py +++ 
b/vllm/inputs/registry.py @@ -162,11 +162,22 @@ def call_hf_processor( requires_kw_only=False, allow_var_kwargs=True, ) + is_gpu = allowed_kwargs.get("device", "cpu") != "cpu" def maybe_cast_dtype(x): - # This mimics the behavior of transformers.BatchFeature - if isinstance(x, torch.Tensor) and x.is_floating_point(): - return x.to(dtype=self.model_config.dtype) + if isinstance(x, torch.Tensor): + # This mimics the behavior of transformers.BatchFeature + if x.is_floating_point(): + x = x.to(dtype=self.model_config.dtype) + + # This is required because we need to transfer the data + # to engine core, and the serialization process expects + # CPU tensors. + # The dtype of model config is usually lower precision + # so we call this last to transfer less data to CPU + if is_gpu: + x = x.cpu() + return x try: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 29cda4d837bf..1454bc0ed5a9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -36,7 +36,8 @@ from vllm.model_executor.models.interfaces_base import ( VllmModelForPooling, is_pooling_model, is_text_generation_model) from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs, + PlaceholderRange) from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingType @@ -584,14 +585,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Refresh batch metadata with any pending updates. 
self.input_batch.refresh_metadata() - def _init_model_kwargs_for_multimodal_model( + def _extract_mm_kwargs( self, - scheduler_output: Optional["SchedulerOutput"] = None, - num_reqs: int = -1, - ) -> dict[str, Any]: - - model_kwargs: dict[str, Any] = {} - if self.is_multimodal_raw_input_supported: + scheduler_output: "SchedulerOutput", + ) -> BatchedTensorInputs: + if self.is_multimodal_raw_input_supported: # noqa: SIM102 # This model requires the raw multimodal data in input. if scheduler_output: multi_modal_kwargs_list = [] @@ -600,21 +598,17 @@ def _init_model_kwargs_for_multimodal_model( if not isinstance(req_mm_inputs, list): req_mm_inputs = list(req_mm_inputs) multi_modal_kwargs_list.extend(req_mm_inputs) - multi_modal_kwargs = MultiModalKwargs.batch( - multi_modal_kwargs_list) - else: - # The only case where SchedulerOutput is None is for - # a dummy run let's get some dummy data. - dummy_data = [ - self.mm_registry.get_decoder_dummy_data( - model_config=self.model_config, - seq_len=1).multi_modal_data for i in range(num_reqs) - ] - multi_modal_kwargs = MultiModalKwargs.batch(dummy_data) - model_kwargs.update(multi_modal_kwargs) + return MultiModalKwargs.batch(multi_modal_kwargs_list) - return model_kwargs + return {} + + def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: + if self.is_multimodal_raw_input_supported: + dummy_data_modality, _ = self._get_modality_with_max_tokens() + return self._get_mm_dummy_batch(dummy_data_modality, num_seqs) + + return {} def _get_cumsum_and_arange( self, @@ -1526,9 +1520,6 @@ def execute_model( # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. 
input_ids = self.input_ids[:num_scheduled_tokens] - - model_kwargs = self._init_model_kwargs_for_multimodal_model( - scheduler_output=scheduler_output) inputs_embeds = self.model.get_input_embeddings( input_ids=input_ids, multimodal_embeddings=mm_embeds or None, @@ -1538,6 +1529,7 @@ def execute_model( self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) inputs_embeds = self.inputs_embeds[:num_input_tokens] input_ids = None + model_kwargs = self._extract_mm_kwargs(scheduler_output) else: # For text-only models, we use token ids as input. # While it is possible to use embeddings as input just like the @@ -2165,6 +2157,77 @@ def rand_input_ids() -> torch.Tensor: yield input_ids.fill_(0) + def _get_modality_with_max_tokens(self): + # NOTE: Currently model is profiled with a single non-text + # modality with the max possible input tokens even when + # it supports multiple. + max_tokens_by_modality_dict = self.mm_registry \ + .get_max_tokens_per_item_by_nonzero_modality(self.model_config) + dummy_data_modality, max_tokens_per_mm_item = max( + max_tokens_by_modality_dict.items(), key=lambda item: item[1]) + + return dummy_data_modality, max_tokens_per_mm_item + + def _get_mm_dummy_params(self): + ( + dummy_data_modality, + max_tokens_per_mm_item, + ) = self._get_modality_with_max_tokens() + + # Check how many items of this modality can be supported by + # the encoder budget. + encoder_budget = min(self.max_num_encoder_input_tokens, + self.encoder_cache_size) + + max_num_mm_items_encoder_budget = encoder_budget // \ + max_tokens_per_mm_item + + # Check how many items of this modality can be supported by + # the decoder budget. + max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt( + self.model_config)[dummy_data_modality] + + # NOTE: We do not consider max_num_batched_tokens on purpose + # because the multimodal embeddings can be generated in advance + # and chunked prefilled. 
+ max_num_mm_items_decoder_budget = self.max_num_reqs * \ + max_mm_items_per_req + + max_num_mm_items = max( + 1, + min(max_num_mm_items_encoder_budget, + max_num_mm_items_decoder_budget)) + + logger.info( + "Encoder cache will be initialized with a budget of %s tokens, " + "and profiled with %s %s items of the maximum feature size.", + encoder_budget, max_num_mm_items, dummy_data_modality) + + return dummy_data_modality, max_num_mm_items + + def _get_mm_dummy_batch(self, modality: str, + batch_size: int) -> BatchedTensorInputs: + """Dummy data for profiling and precompiling multimodal models.""" + dummy_request_data = self.mm_registry.get_decoder_dummy_data( + model_config=self.model_config, + seq_len=self.max_num_tokens, + mm_counts={modality: batch_size}, + ) + dummy_mm_data = dummy_request_data.multi_modal_data + + # When models have a merged processor, their dummy data is + # already batched `MultiModalKwargs`, therefore we take the first + # `MultiModalKwargsItem` from the desired modality to profile on. + dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0) + dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) + + batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] * + batch_size) + return MultiModalKwargs.as_kwargs( + batched_dummy_mm_inputs, + device=self.device, + ) + @torch.inference_mode() def _dummy_run( self, @@ -2231,10 +2294,9 @@ def _dummy_run( num_scheduled_tokens): model = self.model if self.is_multimodal_model: - model_kwargs = self._init_model_kwargs_for_multimodal_model( - num_reqs=num_reqs) input_ids = None inputs_embeds = self.inputs_embeds[:num_tokens] + model_kwargs = self._dummy_mm_kwargs(num_reqs) else: input_ids = self.input_ids[:num_tokens] inputs_embeds = None @@ -2442,60 +2504,11 @@ def profile_run(self) -> None: # TODO: handle encoder-decoder models once we support them. 
if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 and self.encoder_cache_size > 0): - - # NOTE: Currently model is profiled with a single non-text - # modality with the max possible input tokens even when - # it supports multiple. - max_tokens_by_modality_dict = self.mm_registry \ - .get_max_tokens_per_item_by_nonzero_modality(self.model_config) - dummy_data_modality, max_tokens_per_mm_item = max( - max_tokens_by_modality_dict.items(), key=lambda item: item[1]) - - # Check how many items of this modality can be supported by - # the encoder budget. - encoder_budget = min(self.max_num_encoder_input_tokens, - self.encoder_cache_size) - - max_num_mm_items_encoder_budget = encoder_budget // \ - max_tokens_per_mm_item - - # Check how many items of this modality can be supported by - # the decoder budget. - max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt( - self.model_config)[dummy_data_modality] - - # NOTE: We do not consider max_num_batched_tokens on purpose - # because the multimodal embeddings can be generated in advance - # and chunked prefilled. - max_num_mm_items_decoder_budget = self.max_num_reqs * \ - max_mm_items_per_req - - max_num_mm_items = max( - 1, - min(max_num_mm_items_encoder_budget, - max_num_mm_items_decoder_budget)) - - logger.info( - "Encoder cache will be initialized with a budget of %s tokens," - " and profiled with %s %s items of the maximum feature size.", - encoder_budget, max_num_mm_items, dummy_data_modality) + dummy_data_modality, max_num_mm_items = self._get_mm_dummy_params() # Create dummy batch of multimodal inputs. 
- dummy_mm_kwargs = self.mm_registry.get_decoder_dummy_data( - model_config=self.model_config, - seq_len=max_tokens_per_mm_item, - mm_counts={ - dummy_data_modality: 1 - }, - ).multi_modal_data - - batched_dummy_mm_inputs = MultiModalKwargs.batch( - [dummy_mm_kwargs] * max_num_mm_items, - pin_memory=self.pin_memory) - batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs( - batched_dummy_mm_inputs, - device=self.device, - ) + batched_dummy_mm_inputs = self._get_mm_dummy_batch( + dummy_data_modality, max_num_mm_items) # Run multimodal encoder. dummy_encoder_outputs = self.model.get_multimodal_embeddings( diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 59cbb0150570..1fabe06606ea 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1810,23 +1810,14 @@ def prepare_structured_decoding_input( def _get_mm_dummy_batch(self, modality: str, batch_size: int) -> BatchedTensorInputs: - # Dummy data for pre-compiling multimodal models. + """Dummy data for profiling and precompiling multimodal models.""" dummy_request_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, + mm_counts={modality: batch_size}, ) dummy_mm_data = dummy_request_data.multi_modal_data - # Dummy data definition in V0 may contain multiple multimodal items - # (e.g, multiple images) for a single request, therefore here we - # always replicate first item by max_num_mm_items times since in V1 - # they are scheduled to be processed separately. - assert isinstance(dummy_mm_data, MultiModalKwargs), ( - "Expected dummy multimodal data to be of type " - f"MultiModalKwargs, got {type(dummy_mm_data)=} instead. " - "This is most likely due to the model not having a merged " - "processor.") - # When models have a merged processor, their dummy data is # already batched `MultiModalKwargs`, therefore we take the first # `MultiModalKwargsItem` from the desired modality to profile on. 
From 9af73f35522fd07ab4bf9c63ea273101744a3653 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 08:27:41 +0000 Subject: [PATCH 002/130] Remove unused function Signed-off-by: DarkLight1337 --- tests/entrypoints/llm/test_chat.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index d4272dfca0fd..43f44ee05e78 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -201,23 +201,6 @@ def test_chat_extra_kwargs(thinking_llm, enable_thinking): assert think_id in prompt_token_ids -def _get_messages(modality: str): - if modality == "image": - return [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": [{ - "type": "text", - "text": "What is in this image?" - }] - }, - ] - - @pytest.mark.parametrize(("model_id", "modality", "mm_init_kwargs"), [ ("Qwen/Qwen2.5-VL-3B-Instruct", "image", { "use_fast": True From 3a560a061ae55851feaed12f4401f74eccccaf6b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 08:28:29 +0000 Subject: [PATCH 003/130] Format Signed-off-by: DarkLight1337 --- vllm/config.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 4be10328a4b6..ffb43db21797 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3350,10 +3350,9 @@ def merge_mm_processor_kwargs( kwargs = self.mm_processor_kwargs or {} # This is to avoid breaking assumptions in memory profiling - if (init_device := kwargs.get("device", - "cpu")) != (inference_device := - inference_kwargs.get( - "device", init_device)): + init_device = kwargs.get("device", "cpu") + inference_device = inference_kwargs.get("device", init_device) + if init_device != inference_device: raise ValueError( "You cannot override the device for multi-modal preprocessing " f"at runtime! Found: {init_device=} vs. 
{inference_device=}") From 3a3e8c249328af8ec4f4a23979ad16bcef2e293b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 08:29:12 +0000 Subject: [PATCH 004/130] Rename Signed-off-by: DarkLight1337 --- vllm/engine/arg_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a7d6078b9ea9..89a432981cda 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1206,9 +1206,9 @@ def create_engine_config( "GPU-accelerated multi-modal processor.") model_config.set_disable_mm_preprocessor_cache(True) - supports_mm_preprocessor_cache = (self.data_parallel_size == 1 - or data_parallel_external_lb) - if (not supports_mm_preprocessor_cache + dp_supports_mm_preprocessor_cache = (self.data_parallel_size == 1 + or data_parallel_external_lb) + if (not dp_supports_mm_preprocessor_cache and not model_config.disable_mm_preprocessor_cache): logger.warning( "Multi-modal preprocessor cache is not compatible " From ffff508242c3c40cdf06c61cbe5d96b32080cc79 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 08:32:25 +0000 Subject: [PATCH 005/130] Address comment Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 1 + vllm/v1/worker/tpu_model_runner.py | 1 + 2 files changed, 2 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1454bc0ed5a9..038f3b865ca4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2208,6 +2208,7 @@ def _get_mm_dummy_params(self): def _get_mm_dummy_batch(self, modality: str, batch_size: int) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" + # NOTE: Use the full batch size to profile running preprocessor on GPU dummy_request_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, diff --git a/vllm/v1/worker/tpu_model_runner.py 
b/vllm/v1/worker/tpu_model_runner.py index 1fabe06606ea..6458958066d3 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1811,6 +1811,7 @@ def prepare_structured_decoding_input( def _get_mm_dummy_batch(self, modality: str, batch_size: int) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" + # NOTE: Use the full batch size to profile running preprocessor on GPU dummy_request_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, From 9302a3c10d96e124920213e3c8a94b8f3b950c47 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 08:38:47 +0000 Subject: [PATCH 006/130] Make the test more useful Signed-off-by: DarkLight1337 --- tests/entrypoints/llm/test_chat.py | 38 ++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 43f44ee05e78..92a41941050b 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -208,6 +208,8 @@ def test_chat_extra_kwargs(thinking_llm, enable_thinking): ("Qwen/Qwen2-Audio-7B-Instruct", "audio", {}), ]) def test_mm_processing_gpu(model_id, modality, mm_init_kwargs): + device = current_platform.device_name + if modality == "image": messages = get_dummy_messages_from_image_url(TEST_IMAGE_URLS[0]) elif modality == "audio": @@ -215,7 +217,6 @@ def test_mm_processing_gpu(model_id, modality, mm_init_kwargs): else: raise NotImplementedError(modality) - device = current_platform.device_name llm = LLM( model=model_id, max_model_len=6144, @@ -228,7 +229,34 @@ def test_mm_processing_gpu(model_id, modality, mm_init_kwargs): outputs = llm.chat(messages) assert len(outputs) == 1 - if device != "cpu": - match = "cannot override the device for multi-modal preprocessing" - with pytest.raises(ValueError, match=match): - llm.chat(messages, mm_processor_kwargs={"device": 
"cpu"}) + +@pytest.mark.parametrize(("model_id", "modality", "mm_init_kwargs"), [ + ("Qwen/Qwen2.5-VL-3B-Instruct", "image", { + "use_fast": True + }), + ("Qwen/Qwen2-Audio-7B-Instruct", "audio", {}), +]) +def test_mm_processing_gpu_bad_device(model_id, modality, mm_init_kwargs): + device = current_platform.device_name + if device == "cpu": + pytest.skip("Not applicable to CPU") + + if modality == "image": + messages = get_dummy_messages_from_image_url(TEST_IMAGE_URLS[0]) + elif modality == "audio": + messages = get_dummy_messages_from_audio_url(TEST_AUDIO_URLS[0]) + else: + raise NotImplementedError(modality) + + llm = LLM( + model=model_id, + max_model_len=6144, + max_num_seqs=2, + enforce_eager=True, + seed=0, + mm_processor_kwargs=mm_init_kwargs, + ) + + match = "cannot override the device for multi-modal preprocessing" + with pytest.raises(ValueError, match=match): + llm.chat(messages, mm_processor_kwargs={"device": device}) From 91a33adc7c12c67417792f43e3d8ad5d9b591732 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 08:43:12 +0000 Subject: [PATCH 007/130] Update the test Signed-off-by: DarkLight1337 --- tests/entrypoints/llm/test_chat.py | 18 ++++++++++-------- tests/entrypoints/openai/test_audio.py | 12 ++++++------ tests/entrypoints/openai/test_vision.py | 16 ++++++++-------- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 92a41941050b..203499216f73 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -8,10 +8,8 @@ from vllm.distributed import cleanup_dist_env_and_memory from vllm.platforms import current_platform -from ..openai.test_audio import (TEST_AUDIO_URLS, - get_dummy_messages_from_audio_url) -from ..openai.test_vision import (TEST_IMAGE_URLS, - get_dummy_messages_from_image_url) +from ..openai.test_audio import TEST_AUDIO_URLS, dummy_messages_from_audio_url +from ..openai.test_vision import 
TEST_IMAGE_URLS, dummy_messages_from_image_url @pytest.fixture(scope="function") @@ -210,10 +208,11 @@ def test_chat_extra_kwargs(thinking_llm, enable_thinking): def test_mm_processing_gpu(model_id, modality, mm_init_kwargs): device = current_platform.device_name + num_items = 2 if modality == "image": - messages = get_dummy_messages_from_image_url(TEST_IMAGE_URLS[0]) + messages = dummy_messages_from_image_url(TEST_IMAGE_URLS[:num_items]) elif modality == "audio": - messages = get_dummy_messages_from_audio_url(TEST_AUDIO_URLS[0]) + messages = dummy_messages_from_audio_url(TEST_AUDIO_URLS[:num_items]) else: raise NotImplementedError(modality) @@ -223,6 +222,7 @@ def test_mm_processing_gpu(model_id, modality, mm_init_kwargs): max_num_seqs=2, enforce_eager=True, seed=0, + limit_mm_per_prompt={modality: num_items}, mm_processor_kwargs=mm_init_kwargs | {"device": device}, ) @@ -241,10 +241,11 @@ def test_mm_processing_gpu_bad_device(model_id, modality, mm_init_kwargs): if device == "cpu": pytest.skip("Not applicable to CPU") + num_items = 1 if modality == "image": - messages = get_dummy_messages_from_image_url(TEST_IMAGE_URLS[0]) + messages = dummy_messages_from_image_url(TEST_IMAGE_URLS[:num_items]) elif modality == "audio": - messages = get_dummy_messages_from_audio_url(TEST_AUDIO_URLS[0]) + messages = dummy_messages_from_audio_url(TEST_AUDIO_URLS[:num_items]) else: raise NotImplementedError(modality) @@ -254,6 +255,7 @@ def test_mm_processing_gpu_bad_device(model_id, modality, mm_init_kwargs): max_num_seqs=2, enforce_eager=True, seed=0, + limit_mm_per_prompt={modality: num_items}, mm_processor_kwargs=mm_init_kwargs, ) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index f861013b5dfe..6996acb7f2c2 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -52,7 +52,7 @@ def base64_encoded_audio() -> dict[str, str]: } -def get_dummy_messages_from_audio_url( +def 
dummy_messages_from_audio_url( audio_urls: Union[str, list[str]], content_text: str = "What's happening in this audio?", ): @@ -82,7 +82,7 @@ def get_dummy_messages_from_audio_url( @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def test_single_chat_session_audio(client: openai.AsyncOpenAI, model_name: str, audio_url: str): - messages = get_dummy_messages_from_audio_url(audio_url) + messages = dummy_messages_from_audio_url(audio_url) # test single completion chat_completion = await client.chat.completions.create( @@ -122,7 +122,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI, model_name: str, audio_url: str): - messages = get_dummy_messages_from_audio_url(audio_url) + messages = dummy_messages_from_audio_url(audio_url) # audio_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): @@ -139,7 +139,7 @@ async def test_single_chat_session_audio_base64encoded( client: openai.AsyncOpenAI, model_name: str, audio_url: str, base64_encoded_audio: dict[str, str]): - messages = get_dummy_messages_from_audio_url( + messages = dummy_messages_from_audio_url( f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}") # test single completion @@ -235,7 +235,7 @@ async def test_single_chat_session_input_audio( @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) async def test_chat_streaming_audio(client: openai.AsyncOpenAI, model_name: str, audio_url: str): - messages = messages = get_dummy_messages_from_audio_url(audio_url) + messages = messages = dummy_messages_from_audio_url(audio_url) # test single completion chat_completion = await client.chat.completions.create( @@ -339,7 +339,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str]): - messages = 
get_dummy_messages_from_audio_url(audio_urls) + messages = dummy_messages_from_audio_url(audio_urls) if len(audio_urls) > MAXIMUM_AUDIOS: with pytest.raises(openai.BadRequestError): # test multi-audio input diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 88732edd6a85..98eed933a711 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -79,7 +79,7 @@ def base64_encoded_image() -> dict[str, str]: } -def get_dummy_messages_from_image_url( +def dummy_messages_from_image_url( image_urls: Union[str, list[str]], content_text: str = "What's in this image?", ): @@ -129,7 +129,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): async def test_single_chat_session_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): content_text = "What's in this image?" - messages = get_dummy_messages_from_image_url(image_url, content_text) + messages = dummy_messages_from_image_url(image_url, content_text) max_completion_tokens = 10 # test single completion @@ -175,7 +175,7 @@ async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, model_name: str, image_url: str): content_text = "What's in this image?" - messages = get_dummy_messages_from_image_url(image_url, content_text) + messages = dummy_messages_from_image_url(image_url, content_text) # image_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): @@ -192,7 +192,7 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, model_name: str, image_url: str): content_text = "What's in this image?" 
- messages = get_dummy_messages_from_image_url(image_url, content_text) + messages = dummy_messages_from_image_url(image_url, content_text) chat_completion = await client.chat.completions.create( model=model_name, @@ -214,7 +214,7 @@ async def test_single_chat_session_image_base64encoded( client: openai.AsyncOpenAI, model_name: str, image_url: str, base64_encoded_image: dict[str, str]): content_text = "What's in this image?" - messages = get_dummy_messages_from_image_url( + messages = dummy_messages_from_image_url( f"data:image/jpeg;base64,{base64_encoded_image[image_url]}", content_text, ) @@ -267,7 +267,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch( image_url = TEST_IMAGE_URLS[image_idx] expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] - messages = get_dummy_messages_from_image_url( + messages = dummy_messages_from_image_url( f"data:image/jpeg;base64,{base64_encoded_image[image_url]}") chat_completion = await client.chat.completions.create( @@ -287,7 +287,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch( @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_chat_streaming_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): - messages = get_dummy_messages_from_image_url(image_url) + messages = dummy_messages_from_image_url(image_url) # test single completion chat_completion = await client.chat.completions.create( @@ -331,7 +331,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))]) async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]): - messages = get_dummy_messages_from_image_url(image_urls) + messages = dummy_messages_from_image_url(image_urls) if len(image_urls) > MAXIMUM_IMAGES: with pytest.raises(openai.BadRequestError): # test multi-image input From de7549c16bf9d43b767572c7120a481e9e0ccb30 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 
2025 08:58:10 +0000 Subject: [PATCH 008/130] Separate preprocessor and model batch size Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 24 ++++++++++++++-------- vllm/v1/worker/tpu_model_runner.py | 32 +++++++++++++++++++++--------- 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 038f3b865ca4..84905a08757d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -606,7 +606,7 @@ def _extract_mm_kwargs( def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: if self.is_multimodal_raw_input_supported: dummy_data_modality, _ = self._get_modality_with_max_tokens() - return self._get_mm_dummy_batch(dummy_data_modality, num_seqs) + return self._get_mm_dummy_batch(dummy_data_modality, 1, num_seqs) return {} @@ -2203,16 +2203,20 @@ def _get_mm_dummy_params(self): "and profiled with %s %s items of the maximum feature size.", encoder_budget, max_num_mm_items, dummy_data_modality) - return dummy_data_modality, max_num_mm_items + return dummy_data_modality, max_mm_items_per_req, max_num_mm_items - def _get_mm_dummy_batch(self, modality: str, - batch_size: int) -> BatchedTensorInputs: + def _get_mm_dummy_batch( + self, + modality: str, + preprocessor_batch_size: int, + model_batch_size: int, + ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" # NOTE: Use the full batch size to profile running preprocessor on GPU dummy_request_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, - mm_counts={modality: batch_size}, + mm_counts={modality: preprocessor_batch_size}, ) dummy_mm_data = dummy_request_data.multi_modal_data @@ -2223,7 +2227,7 @@ def _get_mm_dummy_batch(self, modality: str, dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] * - batch_size) 
+ model_batch_size) return MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, device=self.device, @@ -2505,11 +2509,15 @@ def profile_run(self) -> None: # TODO: handle encoder-decoder models once we support them. if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 and self.encoder_cache_size > 0): - dummy_data_modality, max_num_mm_items = self._get_mm_dummy_params() + ( + dummy_data_modality, + max_mm_items_per_req, + max_num_mm_items, + ) = self._get_mm_dummy_params() # Create dummy batch of multimodal inputs. batched_dummy_mm_inputs = self._get_mm_dummy_batch( - dummy_data_modality, max_num_mm_items) + dummy_data_modality, max_mm_items_per_req, max_num_mm_items) # Run multimodal encoder. dummy_encoder_outputs = self.model.get_multimodal_embeddings( diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 6458958066d3..a74f16949813 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -295,7 +295,8 @@ def __init__( 0, 32, device="cpu", pin_memory=self.pin_memory) # Get maximum number of mm items per modality (batch size). - self.max_num_mm_items_by_modality = dict() + self.max_num_mm_items_per_req_by_modality = dict[str, int]() + self.max_num_mm_items_by_modality = dict[str, int]() if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 and self.encoder_cache_size > 0): max_tokens_by_modality_dict = ( @@ -314,6 +315,8 @@ def __init__( # the decoder budget. max_mm_items_per_req = self.mm_registry.\ get_mm_limits_per_prompt(self.model_config)[modality] + self.max_num_mm_items_per_req_by_modality[modality] = \ + max_mm_items_per_req # NOTE: We do not consider max_num_batched_tokens on purpose # because the multimodal embeddings can be generated in advance @@ -1336,8 +1339,11 @@ def _set_active_loras(self, prompt_lora_mapping, token_lora_mapping, def _precompile_mm_encoder(self) -> None: # Pre-compile MM encoder for all supported data modalities. 
hf_config = self.vllm_config.model_config.hf_config - for mode, max_items_by_mode in \ - self.max_num_mm_items_by_modality.items(): + max_num_mm_items_per_req_by_modality = \ + self.max_num_mm_items_per_req_by_modality + max_num_mm_items_by_modality = self.max_num_mm_items_by_modality + + for mode, max_items_by_mode in max_num_mm_items_by_modality.items(): logger.info( "Compiling Multimodal %s Encoder with different input" " shapes.", mode) @@ -1346,7 +1352,8 @@ def _precompile_mm_encoder(self) -> None: for num_items in range(1, max_items_by_mode + 1): logger.info(" -- mode: %s items: %d", mode, num_items) batched_dummy_mm_inputs = self._get_mm_dummy_batch( - mode, num_items) + mode, max_num_mm_items_per_req_by_modality[mode], + num_items) # Run multimodal encoder. xm.mark_step() mm_embeds = self.model.\ @@ -1556,6 +1563,9 @@ def profile_run( dummy_data_modality, max_num_mm_items = max( self.max_num_mm_items_by_modality.items(), key=lambda t: t[1]) + max_mm_items_per_req = self.max_num_mm_items_per_req_by_modality[ + dummy_data_modality] + encoder_budget = min(self.max_num_encoder_input_tokens, self.encoder_cache_size) @@ -1566,7 +1576,7 @@ def profile_run( # Create dummy batch of multimodal inputs. batched_dummy_mm_inputs = self._get_mm_dummy_batch( - dummy_data_modality, max_num_mm_items) + dummy_data_modality, max_mm_items_per_req, max_num_mm_items) # Run multimodal encoder. 
# Isolate encoder graph from post-processing to minimize @@ -1808,14 +1818,18 @@ def prepare_structured_decoding_input( self.grammar_bitmask_cpu[:num_reqs].to(logits.device), \ self.structured_decode_arange.to(logits.device) - def _get_mm_dummy_batch(self, modality: str, - batch_size: int) -> BatchedTensorInputs: + def _get_mm_dummy_batch( + self, + modality: str, + preprocessor_batch_size: int, + model_batch_size: int, + ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" # NOTE: Use the full batch size to profile running preprocessor on GPU dummy_request_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, - mm_counts={modality: batch_size}, + mm_counts={modality: preprocessor_batch_size}, ) dummy_mm_data = dummy_request_data.multi_modal_data @@ -1826,7 +1840,7 @@ def _get_mm_dummy_batch(self, modality: str, dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] * - batch_size) + model_batch_size) return MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, device=self.device, From a08240bd831209f8288f6394613079872e41c08f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 09:00:00 +0000 Subject: [PATCH 009/130] Comments Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 6 ++---- vllm/v1/worker/tpu_model_runner.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 84905a08757d..7499c17f5dba 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2212,7 +2212,7 @@ def _get_mm_dummy_batch( model_batch_size: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" - # NOTE: Use the full batch size to profile running preprocessor on GPU + # This represents the maximum GPU consumption of HF 
processor dummy_request_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, @@ -2220,9 +2220,7 @@ def _get_mm_dummy_batch( ) dummy_mm_data = dummy_request_data.multi_modal_data - # When models have a merged processor, their dummy data is - # already batched `MultiModalKwargs`, therefore we take the first - # `MultiModalKwargsItem` from the desired modality to profile on. + # This represents the maximum GPU consumption of the model dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0) dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index a74f16949813..7da0accdf9a1 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1825,7 +1825,7 @@ def _get_mm_dummy_batch( model_batch_size: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" - # NOTE: Use the full batch size to profile running preprocessor on GPU + # This represents the maximum GPU consumption of HF processor dummy_request_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, @@ -1833,9 +1833,7 @@ def _get_mm_dummy_batch( ) dummy_mm_data = dummy_request_data.multi_modal_data - # When models have a merged processor, their dummy data is - # already batched `MultiModalKwargs`, therefore we take the first - # `MultiModalKwargsItem` from the desired modality to profile on. 
+ # This represents the maximum GPU consumption of the model dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0) dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) From 1f2b4c4dcaca3cf8ca3c57682519ce73f6c1194c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 09:08:26 +0000 Subject: [PATCH 010/130] Rename Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 4 ++-- vllm/v1/worker/tpu_model_runner.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7499c17f5dba..cb1fdb7e5458 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2208,7 +2208,7 @@ def _get_mm_dummy_params(self): def _get_mm_dummy_batch( self, modality: str, - preprocessor_batch_size: int, + processor_batch_size: int, model_batch_size: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" @@ -2216,7 +2216,7 @@ def _get_mm_dummy_batch( dummy_request_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, - mm_counts={modality: preprocessor_batch_size}, + mm_counts={modality: processor_batch_size}, ) dummy_mm_data = dummy_request_data.multi_modal_data diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 7da0accdf9a1..a391894c5a18 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1821,7 +1821,7 @@ def prepare_structured_decoding_input( def _get_mm_dummy_batch( self, modality: str, - preprocessor_batch_size: int, + processor_batch_size: int, model_batch_size: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" @@ -1829,7 +1829,7 @@ def _get_mm_dummy_batch( dummy_request_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, - 
mm_counts={modality: preprocessor_batch_size}, + mm_counts={modality: processor_batch_size}, ) dummy_mm_data = dummy_request_data.multi_modal_data From dbd61591e59a7d75438e7453ead9b6245b27652b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 12:30:05 +0000 Subject: [PATCH 011/130] Update docs Signed-off-by: DarkLight1337 --- docs/configuration/conserving_memory.md | 2 ++ docs/configuration/optimization.md | 45 ++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 75d19e4420f4..02c0d7e28cf0 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -87,6 +87,8 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", If you run out of CPU RAM, try the following options: - (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). + The actual memory usage is double of this value because the cache is mirrored across API and engine core processes. + You can also disable the cache entirely via the `disable_mm_preprocessor_cache` flag. - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). ## Multi-modal input limits diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 9576125b86f7..72d918170325 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -129,7 +129,45 @@ Data parallelism replicates the entire model across multiple GPU sets and proces Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`. Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size. 
-### Multi-modal processing using GPU +## Multi-modal processing + +### Multi-modal processor cache + +By default, the multi-modal processor cache is enabled to avoid repeatedly calling Hugging Face processors +on the same multi-modal inputs, which commonly occurs in multi-turn conversations. + +You can adjust the size of the cache via `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). +The actual memory usage is double of this value because the cache is mirrored across API and engine core processes. + +If you do not benefit much from the cache, you can disable it explicitly via `disable_mm_preprocessor_cache`: + +```python +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + disable_mm_preprocessor_cache=True) +``` + +### Parallel multi-modal processing + +You can run input processing in parallel via [API server scale-out](../serving/data_parallel_deployment.md#internal-load-balancing). +This is useful when input processing (which is run inside the API server) +becomes a bottleneck compared to model execution (which is run inside engine core). + +```console +# Run 4 API processes and 1 engine core process +vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 + +# Run 4 API processes and 2 engine core processes +vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 +``` + +!!! note + API server scale-out is only available for online inference. + +!!! note + Multi-modal processing cache is disabled when API server scale-out is enabled + because it requires a one-to-one correspondance between API and engine core processes. + +### GPU multi-modal processing You can speed up input processing by running Hugging Face processors on the GPU. To support this, the processor must accept a `device` argument in its call signature. @@ -151,3 +189,8 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", mm_processor_kwargs={"device": "cuda"}) ``` + +!!! 
note + Multi-modal processing cache is disabled when using GPU multi-modal processing + because GPU operations work better with larger batch size which happens less + frequently when the cache is enabled. From c7c6806eb369c6645e2ba35105b869a061fdc57d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 12:31:35 +0000 Subject: [PATCH 012/130] Fix tests Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_audio.py | 15 ++++++++++++++- tests/entrypoints/openai/test_vision.py | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 6996acb7f2c2..ad2bf829674c 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -122,7 +122,20 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI, model_name: str, audio_url: str): - messages = dummy_messages_from_audio_url(audio_url) + messages = [{ + "role": + "user", + "content": [ + { + "type": "audio_url", + "audio_url": audio_url + }, + { + "type": "text", + "text": "What's happening in this audio?" + }, + ], + }] # audio_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 98eed933a711..6317b3ab246d 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -175,7 +175,20 @@ async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, model_name: str, image_url: str): content_text = "What's in this image?" 
- messages = dummy_messages_from_image_url(image_url, content_text) + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": image_url + }, + { + "type": "text", + "text": content_text + }, + ], + }] # image_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): From 1d1b4191233b35098fbedd1b59bb5d9f37527a11 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 12:51:10 +0000 Subject: [PATCH 013/130] Use async d2h Signed-off-by: DarkLight1337 --- vllm/inputs/registry.py | 76 +++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index ce47bc928c4b..8a47425c22de 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -141,6 +141,40 @@ def get_hf_processor( **kwargs, ) + def _postprocess_output( + self, + output: JSONTree, + *, + is_gpu: bool, + ) -> JSONTree: + + def _postprocess_one(x: object): + if isinstance(x, torch.Tensor): + # This mimics the behavior of transformers.BatchFeature + if x.is_floating_point(): + x = x.to(dtype=self.model_config.dtype) + + # This is required because we need to transfer the data + # to engine core, and the serialization process expects + # CPU tensors. 
+ # The dtype of model config is usually lower precision + # so we call this last to transfer less data to CPU + if is_gpu: + x = x.to(device="cpu", non_blocking=True) + + return x + + output = json_map_leaves(_postprocess_one, output) + + # GPU -> CPU requires explicit synchronization + if is_gpu: + from vllm.platforms import current_platform + synchronize = current_platform.synchronize + if synchronize is not None: + synchronize() + + return output + def call_hf_processor( self, hf_processor: ProcessorMixin, @@ -164,45 +198,29 @@ def call_hf_processor( ) is_gpu = allowed_kwargs.get("device", "cpu") != "cpu" - def maybe_cast_dtype(x): - if isinstance(x, torch.Tensor): - # This mimics the behavior of transformers.BatchFeature - if x.is_floating_point(): - x = x.to(dtype=self.model_config.dtype) - - # This is required because we need to transfer the data - # to engine core, and the serialization process expects - # CPU tensors. - # The dtype of model config is usually lower precision - # so we call this last to transfer less data to CPU - if is_gpu: - x = x.cpu() - - return x - try: output = hf_processor(**data, **allowed_kwargs, return_tensors="pt") - # this emulates output.to(dtype=self.model_config.dtype) - if isinstance(output, BatchFeature): - cast_output = json_map_leaves(maybe_cast_dtype, output.data) - return BatchFeature(cast_output) - - cast_output = json_map_leaves(maybe_cast_dtype, output) - - logger.warning_once( - f"{type(hf_processor).__name__} did not return `BatchFeature`. " - "Make sure to match the behaviour of `ProcessorMixin` when " - "implementing custom processors.") - return cast_output - except Exception as exc: msg = (f"Failed to apply {type(hf_processor).__name__} " f"on data={data} with kwargs={allowed_kwargs}") raise ValueError(msg) from exc + if isinstance(output, BatchFeature): + output_ = self._postprocess_output(output.data, is_gpu=is_gpu) + return BatchFeature(output_) + + logger.warning_once( + "%s did not return `BatchFeature`. 
" + "Make sure to match the behaviour of `ProcessorMixin` when " + "implementing custom processors.", + type(hf_processor).__name__, + ) + + return self._postprocess_output(output, is_gpu=is_gpu) + class DummyData(NamedTuple): """ From 6147bec5ca7a94e39aee6cf86451fadda52d0420 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 13:20:54 +0000 Subject: [PATCH 014/130] Reword Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 72d918170325..2b3a3f9120b5 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -131,7 +131,7 @@ Note that MoE layers will be sharded according to the product of the tensor para ## Multi-modal processing -### Multi-modal processor cache +### Processor Cache By default, the multi-modal processor cache is enabled to avoid repeatedly calling Hugging Face processors on the same multi-modal inputs, which commonly occurs in multi-turn conversations. @@ -146,7 +146,7 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", disable_mm_preprocessor_cache=True) ``` -### Parallel multi-modal processing +### Parallel Processing You can run input processing in parallel via [API server scale-out](../serving/data_parallel_deployment.md#internal-load-balancing). This is useful when input processing (which is run inside the API server) @@ -164,10 +164,10 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 API server scale-out is only available for online inference. !!! note - Multi-modal processing cache is disabled when API server scale-out is enabled + Multi-modal processor cache is disabled when API server scale-out is enabled because it requires a one-to-one correspondance between API and engine core processes. 
-### GPU multi-modal processing +### GPU-accelerated Processing You can speed up input processing by running Hugging Face processors on the GPU. To support this, the processor must accept a `device` argument in its call signature. @@ -191,6 +191,6 @@ llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", ``` !!! note - Multi-modal processing cache is disabled when using GPU multi-modal processing + Multi-modal processor cache is disabled when using GPU multi-modal processing because GPU operations work better with larger batch size which happens less frequently when the cache is enabled. From a18af4bf407329ad49c17d69ccf67095d1f0fbaf Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 13:22:22 +0000 Subject: [PATCH 015/130] Reword Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 2b3a3f9120b5..13cbe702d6d8 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -192,5 +192,5 @@ llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", !!! note Multi-modal processor cache is disabled when using GPU multi-modal processing - because GPU operations work better with larger batch size which happens less + because GPU operations work better with larger batch size, which happens less frequently when the cache is enabled. 
From 55b90aa2ddc480890dc5bec90b14a1f9861cf8da Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 13:23:29 +0000 Subject: [PATCH 016/130] Reword Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 13cbe702d6d8..d877daef60f7 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -129,7 +129,7 @@ Data parallelism replicates the entire model across multiple GPU sets and proces Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`. Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size. -## Multi-modal processing +## Multi-modal Processing ### Processor Cache From ef1ec3854224dd8ff8a20e7c9a0ed09422b760f3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 15:35:32 +0000 Subject: [PATCH 017/130] Reorganize Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 2 +- vllm/v1/worker/gpu_model_runner.py | 27 ++++++++++++++------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index d877daef60f7..6ed8b3c6eef5 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -139,7 +139,7 @@ on the same multi-modal inputs, which commonly occurs in multi-turn conversation You can adjust the size of the cache via `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). The actual memory usage is double of this value because the cache is mirrored across API and engine core processes. 
-If you do not benefit much from the cache, you can disable it explicitly via `disable_mm_preprocessor_cache`: +If you do not benefit much from the cache, you can disable it completely via `disable_mm_preprocessor_cache`: ```python llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index cb1fdb7e5458..8dcc76e6515e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1519,17 +1519,18 @@ def execute_model( # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - input_ids = self.input_ids[:num_scheduled_tokens] - inputs_embeds = self.model.get_input_embeddings( - input_ids=input_ids, + inputs_embeds_scheduled = self.model.get_input_embeddings( + input_ids=self.input_ids[:num_scheduled_tokens], multimodal_embeddings=mm_embeds or None, ) # TODO(woosuk): Avoid the copy. Optimize. - self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) - inputs_embeds = self.inputs_embeds[:num_input_tokens] + self.inputs_embeds[:num_scheduled_tokens].copy_( + inputs_embeds_scheduled) + input_ids = None - model_kwargs = self._extract_mm_kwargs(scheduler_output) + inputs_embeds = self.inputs_embeds[:num_input_tokens] + model_mm_kwargs = self._extract_mm_kwargs(scheduler_output) else: # For text-only models, we use token ids as input. # While it is possible to use embeddings as input just like the @@ -1537,7 +1538,8 @@ def execute_model( # then the embedding layer is not included in the CUDA graph. 
input_ids = self.input_ids[:num_input_tokens] inputs_embeds = None - model_kwargs = {} + model_mm_kwargs = {} + if self.uses_mrope: positions = self.mrope_positions[:, :num_input_tokens] else: @@ -1571,7 +1573,7 @@ def execute_model( intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, **MultiModalKwargs.as_kwargs( - model_kwargs, + model_mm_kwargs, device=self.device, ), ) @@ -2295,15 +2297,14 @@ def _dummy_run( with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): - model = self.model if self.is_multimodal_model: input_ids = None inputs_embeds = self.inputs_embeds[:num_tokens] - model_kwargs = self._dummy_mm_kwargs(num_reqs) + model_mm_kwargs = self._dummy_mm_kwargs(num_reqs) else: input_ids = self.input_ids[:num_tokens] inputs_embeds = None - model_kwargs = {} + model_mm_kwargs = {} if self.uses_mrope: positions = self.mrope_positions[:, :num_tokens] @@ -2328,13 +2329,13 @@ def _dummy_run( self.vllm_config, num_tokens=num_tokens, num_tokens_across_dp=num_tokens_across_dp): - outputs = model( + outputs = self.model( input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, **MultiModalKwargs.as_kwargs( - model_kwargs, + model_mm_kwargs, device=self.device, ), ) From 6937d5ed98d3a28e0d898ebf9f382fd0c38a67b6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 16:31:00 +0000 Subject: [PATCH 018/130] Fix incorrect batch size causing hang Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 85 +++++++++-------- vllm/v1/worker/tpu_model_runner.py | 147 ++++++++++++++++++----------- 2 files changed, 137 insertions(+), 95 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8dcc76e6515e..2e54583bbfd7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -605,8 +605,8 @@ def _extract_mm_kwargs( def _dummy_mm_kwargs(self, num_seqs: int) -> 
BatchedTensorInputs: if self.is_multimodal_raw_input_supported: - dummy_data_modality, _ = self._get_modality_with_max_tokens() - return self._get_mm_dummy_batch(dummy_data_modality, 1, num_seqs) + dummy_modality, _ = self._get_modality_with_max_tokens() + return self._get_mm_dummy_batch(dummy_modality, 1, num_seqs) return {} @@ -2160,52 +2160,49 @@ def rand_input_ids() -> torch.Tensor: input_ids.fill_(0) def _get_modality_with_max_tokens(self): - # NOTE: Currently model is profiled with a single non-text - # modality with the max possible input tokens even when - # it supports multiple. - max_tokens_by_modality_dict = self.mm_registry \ + max_tokens_by_modality = self.mm_registry \ .get_max_tokens_per_item_by_nonzero_modality(self.model_config) - dummy_data_modality, max_tokens_per_mm_item = max( - max_tokens_by_modality_dict.items(), key=lambda item: item[1]) + dummy_modality, max_tokens_per_mm_item = max( + max_tokens_by_modality.items(), key=lambda item: item[1]) - return dummy_data_modality, max_tokens_per_mm_item + return dummy_modality, max_tokens_per_mm_item - def _get_mm_dummy_params(self): - ( - dummy_data_modality, - max_tokens_per_mm_item, - ) = self._get_modality_with_max_tokens() + def _get_encoder_budget(self) -> int: + return min(self.max_num_encoder_input_tokens, self.encoder_cache_size) + def _get_max_mm_items( + self, + modality: str, + max_tokens_per_mm_item: int, + ): # Check how many items of this modality can be supported by # the encoder budget. - encoder_budget = min(self.max_num_encoder_input_tokens, - self.encoder_cache_size) + encoder_budget = self._get_encoder_budget() - max_num_mm_items_encoder_budget = encoder_budget // \ - max_tokens_per_mm_item + max_encoder_mm_items = encoder_budget // max_tokens_per_mm_item # Check how many items of this modality can be supported by # the decoder budget. 
- max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt( - self.model_config)[dummy_data_modality] + mm_limits = self.mm_registry.get_mm_limits_per_prompt( + self.model_config) + mm_limit = mm_limits[modality] + + max_mm_items_per_prompt = max( + 1, + min(mm_limit, self.max_num_tokens // max_tokens_per_mm_item), + ) # NOTE: We do not consider max_num_batched_tokens on purpose # because the multimodal embeddings can be generated in advance # and chunked prefilled. - max_num_mm_items_decoder_budget = self.max_num_reqs * \ - max_mm_items_per_req + max_decoder_mm_items = self.max_num_reqs * max_mm_items_per_prompt - max_num_mm_items = max( + max_mm_items_per_req = max( 1, - min(max_num_mm_items_encoder_budget, - max_num_mm_items_decoder_budget)) - - logger.info( - "Encoder cache will be initialized with a budget of %s tokens, " - "and profiled with %s %s items of the maximum feature size.", - encoder_budget, max_num_mm_items, dummy_data_modality) + min(max_encoder_mm_items, max_decoder_mm_items), + ) - return dummy_data_modality, max_mm_items_per_req, max_num_mm_items + return max_mm_items_per_prompt, max_mm_items_per_req def _get_mm_dummy_batch( self, @@ -2214,7 +2211,7 @@ def _get_mm_dummy_batch( model_batch_size: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" - # This represents the maximum GPU consumption of HF processor + # Result in the maximum GPU consumption of HF processor dummy_request_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, @@ -2222,7 +2219,7 @@ def _get_mm_dummy_batch( ) dummy_mm_data = dummy_request_data.multi_modal_data - # This represents the maximum GPU consumption of the model + # Result in the maximum GPU consumption of the model dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0) dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) @@ -2508,15 +2505,27 @@ def profile_run(self) -> None: # 
TODO: handle encoder-decoder models once we support them. if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 and self.encoder_cache_size > 0): + + # NOTE: Currently model is profiled with a single non-text + # modality with the max possible input tokens even when + # it supports multiple. + dummy_modality, max_tokens = self._get_modality_with_max_tokens() ( - dummy_data_modality, + max_mm_items_per_prompt, max_mm_items_per_req, - max_num_mm_items, - ) = self._get_mm_dummy_params() + ) = self._get_max_mm_items(dummy_modality, max_tokens) + + logger.info( + "Encoder cache will be initialized with a budget of %s tokens," + " and profiled with %s %s items of the maximum feature size.", + self._get_encoder_budget(), + max_mm_items_per_req, + dummy_modality, + ) # Create dummy batch of multimodal inputs. batched_dummy_mm_inputs = self._get_mm_dummy_batch( - dummy_data_modality, max_mm_items_per_req, max_num_mm_items) + dummy_modality, max_mm_items_per_prompt, max_mm_items_per_req) # Run multimodal encoder. dummy_encoder_outputs = self.model.get_multimodal_embeddings( @@ -2524,7 +2533,7 @@ def profile_run(self) -> None: sanity_check_mm_encoder_outputs( dummy_encoder_outputs, - expected_num_items=max_num_mm_items, + expected_num_items=max_mm_items_per_req, ) # Cache the dummy encoder outputs. diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index a391894c5a18..75f4d612809c 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -295,39 +295,24 @@ def __init__( 0, 32, device="cpu", pin_memory=self.pin_memory) # Get maximum number of mm items per modality (batch size). 
- self.max_num_mm_items_per_req_by_modality = dict[str, int]() - self.max_num_mm_items_by_modality = dict[str, int]() + self.max_mm_items_per_prompt_by_modality = dict[str, int]() + self.max_mm_items_per_seq_by_modality = dict[str, int]() if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 and self.encoder_cache_size > 0): - max_tokens_by_modality_dict = ( - MULTIMODAL_REGISTRY. - get_max_tokens_per_item_by_nonzero_modality(self.model_config)) - for modality, max_tokens in max_tokens_by_modality_dict.items(): - # Check how many items of this modality can be supported by - # the encoder budget. - encoder_budget = min(self.max_num_encoder_input_tokens, - self.encoder_cache_size) - - max_num_mm_items_encoder_budget = cdiv(encoder_budget, - max_tokens) - - # Check how many items of this modality can be supported by - # the decoder budget. - max_mm_items_per_req = self.mm_registry.\ - get_mm_limits_per_prompt(self.model_config)[modality] - self.max_num_mm_items_per_req_by_modality[modality] = \ + max_tokens_by_modality = self.mm_registry \ + .get_max_tokens_per_item_by_nonzero_modality(self.model_config) + + for modality, max_tokens in max_tokens_by_modality.items(): + ( + max_mm_items_per_prompt, + max_mm_items_per_req, + ) = self._get_max_mm_items(modality, max_tokens) + + self.max_mm_items_per_prompt_by_modality[modality] = \ + max_mm_items_per_prompt + self.max_mm_items_per_seq_by_modality[modality] = \ max_mm_items_per_req - # NOTE: We do not consider max_num_batched_tokens on purpose - # because the multimodal embeddings can be generated in advance - # and chunked prefilled. 
- max_num_mm_items_decoder_budget = self.max_num_reqs * \ - max_mm_items_per_req - - max_num_mm_items = min(max_num_mm_items_encoder_budget, - max_num_mm_items_decoder_budget) - self.max_num_mm_items_by_modality[modality] = max_num_mm_items - if not self.use_spmd: self.sample_from_logits_func = torch.compile( self.sample_from_logits, @@ -1339,25 +1324,30 @@ def _set_active_loras(self, prompt_lora_mapping, token_lora_mapping, def _precompile_mm_encoder(self) -> None: # Pre-compile MM encoder for all supported data modalities. hf_config = self.vllm_config.model_config.hf_config - max_num_mm_items_per_req_by_modality = \ - self.max_num_mm_items_per_req_by_modality - max_num_mm_items_by_modality = self.max_num_mm_items_by_modality + max_mm_items_per_prompt_by_modality = \ + self.max_mm_items_per_prompt_by_modality + max_mm_items_per_seq_by_modality = self.max_mm_items_per_seq_by_modality + + for mode, max_items_per_seq in max_mm_items_per_seq_by_modality.items( + ): + max_items_per_prompt = max_mm_items_per_prompt_by_modality[mode] - for mode, max_items_by_mode in max_num_mm_items_by_modality.items(): logger.info( "Compiling Multimodal %s Encoder with different input" " shapes.", mode) start = time.perf_counter() # No padding for MM encoder just yet. - for num_items in range(1, max_items_by_mode + 1): + for num_items in range(1, max_items_per_seq + 1): logger.info(" -- mode: %s items: %d", mode, num_items) batched_dummy_mm_inputs = self._get_mm_dummy_batch( - mode, max_num_mm_items_per_req_by_modality[mode], - num_items) + mode, + max_items_per_prompt, + num_items, + ) # Run multimodal encoder. 
xm.mark_step() - mm_embeds = self.model.\ - get_multimodal_embeddings(**batched_dummy_mm_inputs) + mm_embeds = self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) xm.mark_step() num_patches = mm_embeds[0].shape[0] items_size = num_patches * num_items @@ -1560,23 +1550,23 @@ def profile_run( # NOTE: Currently model is profiled with a single non-text # modality with the max possible input tokens even when # it supports multiple. - dummy_data_modality, max_num_mm_items = max( - self.max_num_mm_items_by_modality.items(), key=lambda t: t[1]) - - max_mm_items_per_req = self.max_num_mm_items_per_req_by_modality[ - dummy_data_modality] - - encoder_budget = min(self.max_num_encoder_input_tokens, - self.encoder_cache_size) + dummy_modality, max_tokens = self._get_modality_with_max_tokens() + ( + max_mm_items_per_prompt, + max_mm_items_per_req, + ) = self._get_max_mm_items(dummy_modality, max_tokens) logger.info( - "Encoder cache will be initialized with a budget of %d tokens," + "Encoder cache will be initialized with a budget of %s tokens," " and profiled with %s %s items of the maximum feature size.", - encoder_budget, max_num_mm_items, dummy_data_modality) + self._get_encoder_budget(), + max_mm_items_per_req, + dummy_modality, + ) # Create dummy batch of multimodal inputs. batched_dummy_mm_inputs = self._get_mm_dummy_batch( - dummy_data_modality, max_mm_items_per_req, max_num_mm_items) + dummy_modality, max_mm_items_per_prompt, max_mm_items_per_req) # Run multimodal encoder. # Isolate encoder graph from post-processing to minimize @@ -1592,12 +1582,10 @@ def profile_run( "Multimodal Encoder profiling finished in in %.2f [secs].", end - start) - assert len(dummy_encoder_outputs) == max_num_mm_items, ( - "Expected dimension 0 of encoder outputs to match the number " - f"of multimodal data items: {max_num_mm_items}, got " - f"{len(dummy_encoder_outputs)=} instead. 
This is most likely " - "due to the 'get_multimodal_embeddings' method of the model " - "not implemented correctly.") + sanity_check_mm_encoder_outputs( + dummy_encoder_outputs, + expected_num_items=max_mm_items_per_req, + ) # Cache the dummy encoder outputs. self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) @@ -1818,6 +1806,51 @@ def prepare_structured_decoding_input( self.grammar_bitmask_cpu[:num_reqs].to(logits.device), \ self.structured_decode_arange.to(logits.device) + def _get_modality_with_max_tokens(self): + max_tokens_by_modality = self.mm_registry \ + .get_max_tokens_per_item_by_nonzero_modality(self.model_config) + dummy_modality, max_tokens_per_mm_item = max( + max_tokens_by_modality.items(), key=lambda item: item[1]) + + return dummy_modality, max_tokens_per_mm_item + + def _get_encoder_budget(self) -> int: + return min(self.max_num_encoder_input_tokens, self.encoder_cache_size) + + def _get_max_mm_items( + self, + modality: str, + max_tokens_per_mm_item: int, + ): + # Check how many items of this modality can be supported by + # the encoder budget. + encoder_budget = self._get_encoder_budget() + + max_encoder_mm_items = encoder_budget // max_tokens_per_mm_item + + # Check how many items of this modality can be supported by + # the decoder budget. + mm_limits = self.mm_registry.get_mm_limits_per_prompt( + self.model_config) + mm_limit = mm_limits[modality] + + max_mm_items_per_prompt = max( + 1, + min(mm_limit, self.max_num_tokens // max_tokens_per_mm_item), + ) + + # NOTE: We do not consider max_num_batched_tokens on purpose + # because the multimodal embeddings can be generated in advance + # and chunked prefilled. 
+ max_decoder_mm_items = self.max_num_reqs * max_mm_items_per_prompt + + max_mm_items_per_req = max( + 1, + min(max_encoder_mm_items, max_decoder_mm_items), + ) + + return max_mm_items_per_prompt, max_mm_items_per_req + def _get_mm_dummy_batch( self, modality: str, @@ -1825,7 +1858,7 @@ def _get_mm_dummy_batch( model_batch_size: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" - # This represents the maximum GPU consumption of HF processor + # Result in the maximum GPU consumption of HF processor dummy_request_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, @@ -1833,7 +1866,7 @@ def _get_mm_dummy_batch( ) dummy_mm_data = dummy_request_data.multi_modal_data - # This represents the maximum GPU consumption of the model + # Result in the maximum GPU consumption of the model dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0) dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) From 43577360c2834abdb83eaecbc74392cc6f83d135 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Aug 2025 16:42:51 +0000 Subject: [PATCH 019/130] Fix Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2e54583bbfd7..4088b0cb17f3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2189,7 +2189,7 @@ def _get_max_mm_items( max_mm_items_per_prompt = max( 1, - min(mm_limit, self.max_num_tokens // max_tokens_per_mm_item), + min(mm_limit, self.max_model_len // max_tokens_per_mm_item), ) # NOTE: We do not consider max_num_batched_tokens on purpose diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 75f4d612809c..9b7830883248 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ 
b/vllm/v1/worker/tpu_model_runner.py @@ -1836,7 +1836,7 @@ def _get_max_mm_items( max_mm_items_per_prompt = max( 1, - min(mm_limit, self.max_num_tokens // max_tokens_per_mm_item), + min(mm_limit, self.max_model_len // max_tokens_per_mm_item), ) # NOTE: We do not consider max_num_batched_tokens on purpose From b1d9367fa643ab474bde4fb051cd874b0689a315 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 2 Aug 2025 07:15:56 +0000 Subject: [PATCH 020/130] Consolidate budget calculation Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 145 +++++++++------------- vllm/v1/worker/tpu_model_runner.py | 190 +++++++++++------------------ vllm/v1/worker/utils.py | 99 +++++++++++++++ 3 files changed, 225 insertions(+), 209 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 085d54a4febe..dc7ed04bd109 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -51,7 +51,6 @@ AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, make_kv_sharing_fast_prefill_attention_metadata, make_local_attention_virtual_batches) -from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import (AttentionSpec, ChunkedLocalAttentionSpec, FullAttentionSpec, KVCacheConfig, @@ -73,7 +72,7 @@ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from ..sample.logits_processor import LogitsProcessorManager -from .utils import (bind_kv_cache, gather_mm_placeholders, +from .utils import (MultiModalBudget, bind_kv_cache, gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) @@ -148,14 +147,6 @@ def __init__( self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope - encoder_compute_budget, encoder_cache_size = compute_encoder_budget( - model_config=model_config, - scheduler_config=scheduler_config, - mm_registry=self.mm_registry, - ) - 
self.max_num_encoder_input_tokens = encoder_compute_budget - self.encoder_cache_size = encoder_cache_size - # Sampler self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode) @@ -330,6 +321,14 @@ def __init__( self.kv_sharing_fast_prefill_logits_indices = torch.zeros( self.max_num_tokens, dtype=torch.int32, device=self.device) + self.mm_budget = (MultiModalBudget( + self.model_config, + self.scheduler_config, + self.mm_registry, + max_model_len=self.max_model_len, + max_num_reqs=self.max_num_reqs, + ) if self.is_multimodal_model else None) + def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: """ Update the order of requests in the batch based on the attention @@ -605,7 +604,12 @@ def _extract_mm_kwargs( def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: if self.is_multimodal_raw_input_supported: - dummy_modality, _ = self._get_modality_with_max_tokens() + mm_budget = self.mm_budget + assert mm_budget is not None + + dummy_modality, _ = mm_budget \ + .get_modality_with_max_tokens_per_seq() + return self._get_mm_dummy_batch(dummy_modality, 1, num_seqs) return {} @@ -2159,51 +2163,6 @@ def rand_input_ids() -> torch.Tensor: yield input_ids.fill_(0) - def _get_modality_with_max_tokens(self): - max_tokens_by_modality = self.mm_registry \ - .get_max_tokens_per_item_by_nonzero_modality(self.model_config) - dummy_modality, max_tokens_per_mm_item = max( - max_tokens_by_modality.items(), key=lambda item: item[1]) - - return dummy_modality, max_tokens_per_mm_item - - def _get_encoder_budget(self) -> int: - return min(self.max_num_encoder_input_tokens, self.encoder_cache_size) - - def _get_max_mm_items( - self, - modality: str, - max_tokens_per_mm_item: int, - ): - # Check how many items of this modality can be supported by - # the encoder budget. 
- encoder_budget = self._get_encoder_budget() - - max_encoder_mm_items = encoder_budget // max_tokens_per_mm_item - - # Check how many items of this modality can be supported by - # the decoder budget. - mm_limits = self.mm_registry.get_mm_limits_per_prompt( - self.model_config) - mm_limit = mm_limits[modality] - - max_mm_items_per_prompt = max( - 1, - min(mm_limit, self.max_model_len // max_tokens_per_mm_item), - ) - - # NOTE: We do not consider max_num_batched_tokens on purpose - # because the multimodal embeddings can be generated in advance - # and chunked prefilled. - max_decoder_mm_items = self.max_num_reqs * max_mm_items_per_prompt - - max_mm_items_per_req = max( - 1, - min(max_encoder_mm_items, max_decoder_mm_items), - ) - - return max_mm_items_per_prompt, max_mm_items_per_req - def _get_mm_dummy_batch( self, modality: str, @@ -2502,42 +2461,52 @@ def _dummy_pooler_run( def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. - # TODO: handle encoder-decoder models once we support them. - if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 - and self.encoder_cache_size > 0): - - # NOTE: Currently model is profiled with a single non-text - # modality with the max possible input tokens even when - # it supports multiple. - dummy_modality, max_tokens = self._get_modality_with_max_tokens() - ( - max_mm_items_per_prompt, - max_mm_items_per_req, - ) = self._get_max_mm_items(dummy_modality, max_tokens) - - logger.info( - "Encoder cache will be initialized with a budget of %s tokens," - " and profiled with %s %s items of the maximum feature size.", - self._get_encoder_budget(), - max_mm_items_per_req, - dummy_modality, - ) + if self.is_multimodal_model: + mm_budget = self.mm_budget + assert mm_budget is not None + + # TODO: handle encoder-decoder models once we support them. 
+ if (encoder_budget := mm_budget.get_encoder_budget()) > 0: + # NOTE: Currently model is profiled with a single non-text + # modality with the max possible input tokens even when + # it supports multiple. + ( + dummy_modality, + max_tokens, + ) = mm_budget.get_modality_with_max_tokens_per_seq() + ( + max_mm_items_per_prompt, + max_mm_items_per_req, + ) = mm_budget.get_max_items(dummy_modality, max_tokens) + + logger.info( + "Encoder cache will be initialized with a budget of " + "%s tokens, and profiled with %s %s items of the maximum " + "feature size.", + encoder_budget, + max_mm_items_per_req, + dummy_modality, + ) - # Create dummy batch of multimodal inputs. - batched_dummy_mm_inputs = self._get_mm_dummy_batch( - dummy_modality, max_mm_items_per_prompt, max_mm_items_per_req) + # Create dummy batch of multimodal inputs. + batched_dummy_mm_inputs = self._get_mm_dummy_batch( + dummy_modality, + max_mm_items_per_prompt, + max_mm_items_per_req, + ) - # Run multimodal encoder. - dummy_encoder_outputs = self.model.get_multimodal_embeddings( - **batched_dummy_mm_inputs) + # Run multimodal encoder. + dummy_encoder_outputs = self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) - sanity_check_mm_encoder_outputs( - dummy_encoder_outputs, - expected_num_items=max_mm_items_per_req, - ) + sanity_check_mm_encoder_outputs( + dummy_encoder_outputs, + expected_num_items=max_mm_items_per_req, + ) - # Cache the dummy encoder outputs. - self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) + # Cache the dummy encoder outputs. 
+ self.encoder_cache["tmp"] = dict( + enumerate(dummy_encoder_outputs)) # Add `is_profile` here to pre-allocate communication buffers hidden_states, last_hidden_states \ diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 9b7830883248..977ed33258b7 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -42,7 +42,6 @@ PallasAttentionBackend, PallasMetadata, get_page_size_bytes) -from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheSpec, SlidingWindowSpec) @@ -55,7 +54,8 @@ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from vllm.v1.worker.tpu_input_batch import CachedRequestState, InputBatch -from .utils import (bind_kv_cache, initialize_kv_cache_for_kv_sharing, +from .utils import (MultiModalBudget, bind_kv_cache, + initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs) if TYPE_CHECKING: @@ -195,14 +195,6 @@ def __init__( # TODO: Support M-RoPE (e.g, Qwen2-VL) assert not self.uses_mrope, "TPU does not support M-RoPE yet." - encoder_compute_budget, encoder_cache_size = compute_encoder_budget( - model_config=model_config, - scheduler_config=scheduler_config, - mm_registry=self.mm_registry, - ) - self.max_num_encoder_input_tokens = encoder_compute_budget - self.encoder_cache_size = encoder_cache_size - self._num_slices_per_kv_cache_update_block = \ _get_num_slices_per_kv_cache_update_block(get_page_size_bytes( block_size=self.block_size, @@ -294,24 +286,13 @@ def __init__( self.structured_decode_arange = torch.arange( 0, 32, device="cpu", pin_memory=self.pin_memory) - # Get maximum number of mm items per modality (batch size). 
- self.max_mm_items_per_prompt_by_modality = dict[str, int]() - self.max_mm_items_per_seq_by_modality = dict[str, int]() - if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 - and self.encoder_cache_size > 0): - max_tokens_by_modality = self.mm_registry \ - .get_max_tokens_per_item_by_nonzero_modality(self.model_config) - - for modality, max_tokens in max_tokens_by_modality.items(): - ( - max_mm_items_per_prompt, - max_mm_items_per_req, - ) = self._get_max_mm_items(modality, max_tokens) - - self.max_mm_items_per_prompt_by_modality[modality] = \ - max_mm_items_per_prompt - self.max_mm_items_per_seq_by_modality[modality] = \ - max_mm_items_per_req + self.mm_budget = (MultiModalBudget( + self.model_config, + self.scheduler_config, + self.mm_registry, + max_model_len=self.max_model_len, + max_num_reqs=self.max_num_reqs, + ) if self.is_multimodal_model else None) if not self.use_spmd: self.sample_from_logits_func = torch.compile( @@ -1324,13 +1305,15 @@ def _set_active_loras(self, prompt_lora_mapping, token_lora_mapping, def _precompile_mm_encoder(self) -> None: # Pre-compile MM encoder for all supported data modalities. 
hf_config = self.vllm_config.model_config.hf_config - max_mm_items_per_prompt_by_modality = \ - self.max_mm_items_per_prompt_by_modality - max_mm_items_per_seq_by_modality = self.max_mm_items_per_seq_by_modality - for mode, max_items_per_seq in max_mm_items_per_seq_by_modality.items( - ): - max_items_per_prompt = max_mm_items_per_prompt_by_modality[mode] + mm_budget = self.mm_budget + assert mm_budget is not None + + max_items_per_seq_by_modality = mm_budget.max_items_per_seq_by_modality + max_items_per_prompt_by_modality = mm_budget.max_items_per_prompt_by_modality # noqa: E501 + + for mode, max_items_per_seq in max_items_per_seq_by_modality.items(): + max_items_per_prompt = max_items_per_prompt_by_modality[mode] logger.info( "Compiling Multimodal %s Encoder with different input" @@ -1543,52 +1526,62 @@ def profile_run( num_tokens: int, ) -> None: # Profile with multimodal encoder & encoder cache. - # TODO: handle encoder-decoder models once we support them. - if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 - and self.encoder_cache_size > 0): - - # NOTE: Currently model is profiled with a single non-text - # modality with the max possible input tokens even when - # it supports multiple. - dummy_modality, max_tokens = self._get_modality_with_max_tokens() - ( - max_mm_items_per_prompt, - max_mm_items_per_req, - ) = self._get_max_mm_items(dummy_modality, max_tokens) - - logger.info( - "Encoder cache will be initialized with a budget of %s tokens," - " and profiled with %s %s items of the maximum feature size.", - self._get_encoder_budget(), - max_mm_items_per_req, - dummy_modality, - ) + if self.is_multimodal_model: + mm_budget = self.mm_budget + assert mm_budget is not None + + # TODO: handle encoder-decoder models once we support them. + if (encoder_budget := mm_budget.get_encoder_budget()) > 0: + # NOTE: Currently model is profiled with a single non-text + # modality with the max possible input tokens even when + # it supports multiple. 
+ ( + dummy_modality, + max_tokens, + ) = mm_budget.get_modality_with_max_tokens_per_seq() + ( + max_mm_items_per_prompt, + max_mm_items_per_req, + ) = mm_budget.get_max_items(dummy_modality, max_tokens) - # Create dummy batch of multimodal inputs. - batched_dummy_mm_inputs = self._get_mm_dummy_batch( - dummy_modality, max_mm_items_per_prompt, max_mm_items_per_req) + logger.info( + "Encoder cache will be initialized with a budget of " + "%s tokens, and profiled with %s %s items of the maximum " + "feature size.", + encoder_budget, + max_mm_items_per_req, + dummy_modality, + ) - # Run multimodal encoder. - # Isolate encoder graph from post-processing to minimize - # impact of recompilation until it's fixed. - start = time.perf_counter() - xm.mark_step() - dummy_encoder_outputs = self.model.get_multimodal_embeddings( - **batched_dummy_mm_inputs) - xm.mark_step() - xm.wait_device_ops() - end = time.perf_counter() - logger.info( - "Multimodal Encoder profiling finished in in %.2f [secs].", - end - start) + # Create dummy batch of multimodal inputs. + batched_dummy_mm_inputs = self._get_mm_dummy_batch( + dummy_modality, + max_mm_items_per_prompt, + max_mm_items_per_req, + ) - sanity_check_mm_encoder_outputs( - dummy_encoder_outputs, - expected_num_items=max_mm_items_per_req, - ) + # Run multimodal encoder. + # Isolate encoder graph from post-processing to minimize + # impact of recompilation until it's fixed. + start = time.perf_counter() + xm.mark_step() + dummy_encoder_outputs = self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) + xm.mark_step() + xm.wait_device_ops() + end = time.perf_counter() + logger.info( + "Multimodal Encoder profiling finished in in %.2f [secs].", + end - start) + + sanity_check_mm_encoder_outputs( + dummy_encoder_outputs, + expected_num_items=max_mm_items_per_req, + ) - # Cache the dummy encoder outputs. - self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) + # Cache the dummy encoder outputs. 
+ self.encoder_cache["tmp"] = dict( + enumerate(dummy_encoder_outputs)) # Trigger compilation for general shape. self._dummy_run(num_tokens, self.num_reqs_max_model_len, @@ -1806,51 +1799,6 @@ def prepare_structured_decoding_input( self.grammar_bitmask_cpu[:num_reqs].to(logits.device), \ self.structured_decode_arange.to(logits.device) - def _get_modality_with_max_tokens(self): - max_tokens_by_modality = self.mm_registry \ - .get_max_tokens_per_item_by_nonzero_modality(self.model_config) - dummy_modality, max_tokens_per_mm_item = max( - max_tokens_by_modality.items(), key=lambda item: item[1]) - - return dummy_modality, max_tokens_per_mm_item - - def _get_encoder_budget(self) -> int: - return min(self.max_num_encoder_input_tokens, self.encoder_cache_size) - - def _get_max_mm_items( - self, - modality: str, - max_tokens_per_mm_item: int, - ): - # Check how many items of this modality can be supported by - # the encoder budget. - encoder_budget = self._get_encoder_budget() - - max_encoder_mm_items = encoder_budget // max_tokens_per_mm_item - - # Check how many items of this modality can be supported by - # the decoder budget. - mm_limits = self.mm_registry.get_mm_limits_per_prompt( - self.model_config) - mm_limit = mm_limits[modality] - - max_mm_items_per_prompt = max( - 1, - min(mm_limit, self.max_model_len // max_tokens_per_mm_item), - ) - - # NOTE: We do not consider max_num_batched_tokens on purpose - # because the multimodal embeddings can be generated in advance - # and chunked prefilled. 
- max_decoder_mm_items = self.max_num_reqs * max_mm_items_per_prompt - - max_mm_items_per_req = max( - 1, - min(max_encoder_mm_items, max_decoder_mm_items), - ) - - return max_mm_items_per_prompt, max_mm_items_per_req - def _get_mm_dummy_batch( self, modality: str, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 3ecb1d7dd656..71322c7cca5a 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -5,14 +5,113 @@ import torch +from vllm.config import ModelConfig, SchedulerConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index +from vllm.multimodal.registry import MultiModalRegistry +from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec if TYPE_CHECKING: from vllm.attention.layer import Attention +class MultiModalBudget: + """Helper class to calculate budget information for multi-modal models.""" + + def __init__( + self, + model_config: ModelConfig, + scheduler_config: SchedulerConfig, + mm_registry: MultiModalRegistry, + *, + max_model_len: int, + max_num_reqs: int, + ) -> None: + + super().__init__() + + self.model_config = model_config + self.mm_registry = mm_registry + + encoder_compute_budget, encoder_cache_size = compute_encoder_budget( + model_config=model_config, + scheduler_config=scheduler_config, + mm_registry=mm_registry, + ) + + self.max_num_encoder_input_tokens = encoder_compute_budget + self.encoder_cache_size = encoder_cache_size + self.max_model_len = max_model_len + self.max_num_reqs = max_num_reqs + + max_items_per_prompt_by_modality = dict[str, int]() + max_items_per_seq_by_modality = dict[str, int]() + + max_tokens_by_modality = mm_registry \ + .get_max_tokens_per_item_by_nonzero_modality(model_config) + + for modality, max_tokens in max_tokens_by_modality.items(): + ( + max_items_per_prompt, + max_items_per_req, + ) = self.get_max_items(modality, 
max_tokens) + + max_items_per_prompt_by_modality[modality] = max_items_per_prompt + max_items_per_seq_by_modality[modality] = max_items_per_req + + self.max_items_per_prompt_by_modality = max_items_per_prompt_by_modality + self.max_items_per_seq_by_modality = max_items_per_seq_by_modality + + def get_modality_with_max_tokens_per_seq(self) -> tuple[str, int]: + max_tokens_per_seq_by_modality = self.max_items_per_seq_by_modality + modality, max_tokens = max(max_tokens_per_seq_by_modality.items(), + key=lambda item: item[1]) + + return modality, max_tokens + + def get_encoder_budget(self) -> int: + return min(self.max_num_encoder_input_tokens, self.encoder_cache_size) + + def get_max_items( + self, + modality: str, + max_tokens_per_mm_item: int, + ) -> tuple[int, int]: + # Check how many items of this modality can be supported by + # the encoder budget. + encoder_budget = self.get_encoder_budget() + + # TODO: handle encoder-decoder models once we support them. + if encoder_budget == 0: + return 0, 0 + + max_encoder_mm_items = encoder_budget // max_tokens_per_mm_item + + # Check how many items of this modality can be supported by + # the decoder budget. + mm_limits = self.mm_registry.get_mm_limits_per_prompt( + self.model_config) + mm_limit = mm_limits[modality] + + max_mm_items_per_prompt = max( + 1, + min(mm_limit, self.max_model_len // max_tokens_per_mm_item), + ) + + # NOTE: We do not consider max_num_batched_tokens on purpose + # because the multimodal embeddings can be generated in advance + # and chunked prefilled. 
+ max_decoder_mm_items = self.max_num_reqs * max_mm_items_per_prompt + + max_mm_items_per_req = max( + 1, + min(max_encoder_mm_items, max_decoder_mm_items), + ) + + return max_mm_items_per_prompt, max_mm_items_per_req + + def sanity_check_mm_encoder_outputs( mm_embeddings: MultiModalEmbeddings, expected_num_items: int, From aa0b64869e77314153ed10e461facf841936060e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 2 Aug 2025 07:17:22 +0000 Subject: [PATCH 021/130] Remove whitespace Signed-off-by: DarkLight1337 --- vllm/v1/worker/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 71322c7cca5a..de7a7b1997b9 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -28,7 +28,6 @@ def __init__( max_model_len: int, max_num_reqs: int, ) -> None: - super().__init__() self.model_config = model_config From ac1e1c15e4c3833fc14bab526e5bcf0bd56adffb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 2 Aug 2025 07:18:10 +0000 Subject: [PATCH 022/130] Eager init Signed-off-by: DarkLight1337 --- vllm/v1/worker/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index de7a7b1997b9..a86e34623977 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -62,6 +62,8 @@ def __init__( self.max_items_per_prompt_by_modality = max_items_per_prompt_by_modality self.max_items_per_seq_by_modality = max_items_per_seq_by_modality + self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config) + def get_modality_with_max_tokens_per_seq(self) -> tuple[str, int]: max_tokens_per_seq_by_modality = self.max_items_per_seq_by_modality modality, max_tokens = max(max_tokens_per_seq_by_modality.items(), @@ -89,9 +91,7 @@ def get_max_items( # Check how many items of this modality can be supported by # the decoder budget. 
- mm_limits = self.mm_registry.get_mm_limits_per_prompt( - self.model_config) - mm_limit = mm_limits[modality] + mm_limit = self.mm_limits[modality] max_mm_items_per_prompt = max( 1, From 487e3124297a96aa65c21c0b73dbe293bfcee7a5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 2 Aug 2025 07:19:38 +0000 Subject: [PATCH 023/130] Fix Signed-off-by: DarkLight1337 --- vllm/v1/worker/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index a86e34623977..2f57bb341083 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -44,6 +44,8 @@ def __init__( self.max_model_len = max_model_len self.max_num_reqs = max_num_reqs + self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config) + max_items_per_prompt_by_modality = dict[str, int]() max_items_per_seq_by_modality = dict[str, int]() @@ -62,8 +64,6 @@ def __init__( self.max_items_per_prompt_by_modality = max_items_per_prompt_by_modality self.max_items_per_seq_by_modality = max_items_per_seq_by_modality - self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config) - def get_modality_with_max_tokens_per_seq(self) -> tuple[str, int]: max_tokens_per_seq_by_modality = self.max_items_per_seq_by_modality modality, max_tokens = max(max_tokens_per_seq_by_modality.items(), From a574715938ba82677d3e9650d9f34acd7377b12d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 2 Aug 2025 07:26:42 +0000 Subject: [PATCH 024/130] Don't use cache for dummy data Signed-off-by: DarkLight1337 --- vllm/multimodal/registry.py | 10 ++++++++-- vllm/v1/worker/gpu_model_runner.py | 5 +++-- vllm/v1/worker/tpu_model_runner.py | 5 +++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 5f5b620e0cf7..84e32f431eb3 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -260,13 +260,16 @@ def get_decoder_dummy_data( model_config: "ModelConfig", 
seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + *, + disable_cache: bool = False, ) -> DummyDecoderData: """ Create dummy data for profiling the memory usage of a model. The model is identified by ``model_config``. """ - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, + disable_cache=disable_cache) profiler = MultiModalProfiler(processor) dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts) @@ -284,13 +287,16 @@ def get_encoder_dummy_data( model_config: "ModelConfig", seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + *, + disable_cache: bool = False, ) -> DummyEncoderData: """ Create dummy data for profiling the memory usage of a model. The model is identified by ``model_config``. """ - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, + disable_cache=disable_cache) profiler = MultiModalProfiler(processor) dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index dc7ed04bd109..bb56be005558 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2171,12 +2171,13 @@ def _get_mm_dummy_batch( ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" # Result in the maximum GPU consumption of HF processor - dummy_request_data = self.mm_registry.get_decoder_dummy_data( + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, mm_counts={modality: processor_batch_size}, + disable_cache=True, ) - dummy_mm_data = dummy_request_data.multi_modal_data + dummy_mm_data = dummy_decoder_data.multi_modal_data # Result in the maximum GPU consumption of the model dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0) diff --git 
a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 977ed33258b7..072c48d2a7c6 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1807,12 +1807,13 @@ def _get_mm_dummy_batch( ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" # Result in the maximum GPU consumption of HF processor - dummy_request_data = self.mm_registry.get_decoder_dummy_data( + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, mm_counts={modality: processor_batch_size}, + disable_cache=True, ) - dummy_mm_data = dummy_request_data.multi_modal_data + dummy_mm_data = dummy_decoder_data.multi_modal_data # Result in the maximum GPU consumption of the model dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0) From 58f9123ddd45d75354497c14d8b5d6dc0396d2ce Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 2 Aug 2025 07:32:36 +0000 Subject: [PATCH 025/130] Split processor and model data Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 32 +++++++++++++++++++++++------- vllm/v1/worker/tpu_model_runner.py | 32 ++++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bb56be005558..4b8a5867f967 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -38,6 +38,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs, PlaceholderRange) +from vllm.multimodal.profiling import DummyDecoderData from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingType @@ -610,7 +611,13 @@ def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: dummy_modality, _ = mm_budget \ 
.get_modality_with_max_tokens_per_seq() - return self._get_mm_dummy_batch(dummy_modality, 1, num_seqs) + dummy_mm_data = self._get_mm_decoder_dummy_data(dummy_modality, 1) + + return self._get_mm_decoder_dummy_batch( + dummy_modality, + dummy_mm_data, + num_seqs, + ) return {} @@ -2163,20 +2170,27 @@ def rand_input_ids() -> torch.Tensor: yield input_ids.fill_(0) - def _get_mm_dummy_batch( + def _get_mm_decoder_dummy_data( self, modality: str, processor_batch_size: int, - model_batch_size: int, - ) -> BatchedTensorInputs: - """Dummy data for profiling and precompiling multimodal models.""" + ) -> DummyDecoderData: + """Dummy data for profiling and precompiling multimodal processor.""" # Result in the maximum GPU consumption of HF processor - dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( + return self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, mm_counts={modality: processor_batch_size}, disable_cache=True, ) + + def _get_mm_decoder_dummy_batch( + self, + modality: str, + dummy_decoder_data: DummyDecoderData, + model_batch_size: int, + ) -> BatchedTensorInputs: + """Dummy data for profiling and precompiling multimodal models.""" dummy_mm_data = dummy_decoder_data.multi_modal_data # Result in the maximum GPU consumption of the model @@ -2490,9 +2504,13 @@ def profile_run(self) -> None: ) # Create dummy batch of multimodal inputs. 
- batched_dummy_mm_inputs = self._get_mm_dummy_batch( + dummy_mm_data = self._get_mm_decoder_dummy_data( dummy_modality, max_mm_items_per_prompt, + ) + batched_dummy_mm_inputs = self._get_mm_decoder_dummy_batch( + dummy_modality, + dummy_mm_data, max_mm_items_per_req, ) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 072c48d2a7c6..a7142debba91 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -33,6 +33,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs, PlaceholderRange) +from vllm.multimodal.profiling import DummyDecoderData from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.sequence import IntermediateTensors from vllm.tasks import GenerationTask, PoolingTask, SupportedTask @@ -1314,6 +1315,10 @@ def _precompile_mm_encoder(self) -> None: for mode, max_items_per_seq in max_items_per_seq_by_modality.items(): max_items_per_prompt = max_items_per_prompt_by_modality[mode] + dummy_mm_data = self._get_mm_decoder_dummy_data( + mode, + max_items_per_prompt, + ) logger.info( "Compiling Multimodal %s Encoder with different input" @@ -1322,9 +1327,9 @@ def _precompile_mm_encoder(self) -> None: # No padding for MM encoder just yet. for num_items in range(1, max_items_per_seq + 1): logger.info(" -- mode: %s items: %d", mode, num_items) - batched_dummy_mm_inputs = self._get_mm_dummy_batch( + batched_dummy_mm_inputs = self._get_mm_decoder_dummy_batch( mode, - max_items_per_prompt, + dummy_mm_data, num_items, ) # Run multimodal encoder. @@ -1554,9 +1559,13 @@ def profile_run( ) # Create dummy batch of multimodal inputs. 
- batched_dummy_mm_inputs = self._get_mm_dummy_batch( + dummy_mm_data = self._get_mm_decoder_dummy_data( dummy_modality, max_mm_items_per_prompt, + ) + batched_dummy_mm_inputs = self._get_mm_decoder_dummy_batch( + dummy_modality, + dummy_mm_data, max_mm_items_per_req, ) @@ -1799,20 +1808,27 @@ def prepare_structured_decoding_input( self.grammar_bitmask_cpu[:num_reqs].to(logits.device), \ self.structured_decode_arange.to(logits.device) - def _get_mm_dummy_batch( + def _get_mm_decoder_dummy_data( self, modality: str, processor_batch_size: int, - model_batch_size: int, - ) -> BatchedTensorInputs: - """Dummy data for profiling and precompiling multimodal models.""" + ) -> DummyDecoderData: + """Dummy data for profiling and precompiling multimodal processor.""" # Result in the maximum GPU consumption of HF processor - dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( + return self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, mm_counts={modality: processor_batch_size}, disable_cache=True, ) + + def _get_mm_decoder_dummy_batch( + self, + modality: str, + dummy_decoder_data: DummyDecoderData, + model_batch_size: int, + ) -> BatchedTensorInputs: + """Dummy data for profiling and precompiling multimodal models.""" dummy_mm_data = dummy_decoder_data.multi_modal_data # Result in the maximum GPU consumption of the model From 0baf55a57ed5b64f4c980b2349fc246fd062a099 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 2 Aug 2025 13:42:31 +0000 Subject: [PATCH 026/130] Optimize Signed-off-by: DarkLight1337 --- vllm/config.py | 7 +++++++ vllm/inputs/registry.py | 15 +++++++-------- vllm/v1/worker/gpu_model_runner.py | 22 +++++++++++++++------- vllm/v1/worker/tpu_model_runner.py | 22 +++++++++++++++------- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 1db39c709718..e8676a28d2d7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3357,6 +3357,13 @@ class 
MultiModalConfig: Enable fully interleaved support for multimodal prompts. """ + @property + def is_mm_processing_gpu(self) -> bool: + if not self.mm_processor_kwargs: + return False + + return self.mm_processor_kwargs.get("device", "cpu") != "cpu" + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 8a47425c22de..b997aed64ca4 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -144,9 +144,9 @@ def get_hf_processor( def _postprocess_output( self, output: JSONTree, - *, - is_gpu: bool, ) -> JSONTree: + mm_config = self.model_config.get_multimodal_config() + is_mm_processing_gpu = mm_config.is_mm_processing_gpu def _postprocess_one(x: object): if isinstance(x, torch.Tensor): @@ -159,15 +159,15 @@ def _postprocess_one(x: object): # CPU tensors. # The dtype of model config is usually lower precision # so we call this last to transfer less data to CPU - if is_gpu: + if is_mm_processing_gpu: x = x.to(device="cpu", non_blocking=True) return x output = json_map_leaves(_postprocess_one, output) - # GPU -> CPU requires explicit synchronization - if is_gpu: + # Async GPU -> CPU requires explicit synchronization + if is_mm_processing_gpu: from vllm.platforms import current_platform synchronize = current_platform.synchronize if synchronize is not None: @@ -196,7 +196,6 @@ def call_hf_processor( requires_kw_only=False, allow_var_kwargs=True, ) - is_gpu = allowed_kwargs.get("device", "cpu") != "cpu" try: output = hf_processor(**data, @@ -209,7 +208,7 @@ def call_hf_processor( raise ValueError(msg) from exc if isinstance(output, BatchFeature): - output_ = self._postprocess_output(output.data, is_gpu=is_gpu) + output_ = self._postprocess_output(output.data) return BatchFeature(output_) logger.warning_once( @@ -219,7 +218,7 @@ def call_hf_processor( type(hf_processor).__name__, ) - return self._postprocess_output(output, is_gpu=is_gpu) + return 
self._postprocess_output(output) class DummyData(NamedTuple): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6020bf9c9661..a6eeaa382ffd 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2167,22 +2167,30 @@ def rand_input_ids() -> torch.Tensor: def _get_mm_decoder_dummy_data( self, modality: str, - processor_batch_size: int, + processor_max_batch_size: int, ) -> DummyDecoderData: """Dummy data for profiling and precompiling multimodal processor.""" - # Result in the maximum GPU consumption of HF processor + model_config = self.model_config + if model_config.get_multimodal_config().is_mm_processing_gpu: + # Result in the maximum GPU consumption of HF processor + mm_counts = {modality: processor_max_batch_size} + disable_cache = True + else: + mm_counts = {modality: 1} + disable_cache = False + return self.mm_registry.get_decoder_dummy_data( - model_config=self.model_config, + model_config=model_config, seq_len=self.max_num_tokens, - mm_counts={modality: processor_batch_size}, - disable_cache=True, + mm_counts=mm_counts, + disable_cache=disable_cache, ) def _get_mm_decoder_dummy_batch( self, modality: str, dummy_decoder_data: DummyDecoderData, - model_batch_size: int, + model_max_batch_size: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" dummy_mm_data = dummy_decoder_data.multi_modal_data @@ -2192,7 +2200,7 @@ def _get_mm_decoder_dummy_batch( dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] * - model_batch_size) + model_max_batch_size) return MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, device=self.device, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index a7142debba91..cede18f7f82d 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1811,22 +1811,30 @@ def 
prepare_structured_decoding_input( def _get_mm_decoder_dummy_data( self, modality: str, - processor_batch_size: int, + processor_max_batch_size: int, ) -> DummyDecoderData: """Dummy data for profiling and precompiling multimodal processor.""" - # Result in the maximum GPU consumption of HF processor + model_config = self.model_config + if model_config.get_multimodal_config().is_mm_processing_gpu: + # Result in the maximum GPU consumption of HF processor + mm_counts = {modality: processor_max_batch_size} + disable_cache = True + else: + mm_counts = {modality: 1} + disable_cache = False + return self.mm_registry.get_decoder_dummy_data( - model_config=self.model_config, + model_config=model_config, seq_len=self.max_num_tokens, - mm_counts={modality: processor_batch_size}, - disable_cache=True, + mm_counts=mm_counts, + disable_cache=disable_cache, ) def _get_mm_decoder_dummy_batch( self, modality: str, dummy_decoder_data: DummyDecoderData, - model_batch_size: int, + model_max_batch_size: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" dummy_mm_data = dummy_decoder_data.multi_modal_data @@ -1836,7 +1844,7 @@ def _get_mm_decoder_dummy_batch( dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] * - model_batch_size) + model_max_batch_size) return MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, device=self.device, From 423a8aa10423f037392c6e3e5b5599173cdd9841 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 2 Aug 2025 13:44:24 +0000 Subject: [PATCH 027/130] Fix Signed-off-by: DarkLight1337 --- vllm/v1/worker/utils.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 2f57bb341083..de3573ea4ef4 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -77,8 +77,11 @@ def get_encoder_budget(self) -> int: def get_max_items( self, 
modality: str, - max_tokens_per_mm_item: int, + max_tokens_per_item: int, ) -> tuple[int, int]: + if max_tokens_per_item == 0: + return 0, 0 + # Check how many items of this modality can be supported by # the encoder budget. encoder_budget = self.get_encoder_budget() @@ -87,28 +90,28 @@ def get_max_items( if encoder_budget == 0: return 0, 0 - max_encoder_mm_items = encoder_budget // max_tokens_per_mm_item + max_encoder_items = encoder_budget // max_tokens_per_item # Check how many items of this modality can be supported by # the decoder budget. mm_limit = self.mm_limits[modality] - max_mm_items_per_prompt = max( + max_items_per_prompt = max( 1, - min(mm_limit, self.max_model_len // max_tokens_per_mm_item), + min(mm_limit, self.max_model_len // max_tokens_per_item), ) # NOTE: We do not consider max_num_batched_tokens on purpose # because the multimodal embeddings can be generated in advance # and chunked prefilled. - max_decoder_mm_items = self.max_num_reqs * max_mm_items_per_prompt + max_decoder_mm_items = self.max_num_reqs * max_items_per_prompt - max_mm_items_per_req = max( + max_items_per_req = max( 1, - min(max_encoder_mm_items, max_decoder_mm_items), + min(max_encoder_items, max_decoder_mm_items), ) - return max_mm_items_per_prompt, max_mm_items_per_req + return max_items_per_prompt, max_items_per_req def sanity_check_mm_encoder_outputs( From 4eb05299c06b9fa0223e5e87861341ba6d01e17d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 2 Aug 2025 13:50:56 +0000 Subject: [PATCH 028/130] Handle disabled chunked prefill Signed-off-by: DarkLight1337 --- vllm/v1/worker/utils.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index de3573ea4ef4..d0260cb6a277 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -31,6 +31,7 @@ def __init__( super().__init__() self.model_config = model_config + self.scheduler_config = scheduler_config self.mm_registry = mm_registry 
encoder_compute_budget, encoder_cache_size = compute_encoder_budget( @@ -101,14 +102,20 @@ def get_max_items( min(mm_limit, self.max_model_len // max_tokens_per_item), ) - # NOTE: We do not consider max_num_batched_tokens on purpose - # because the multimodal embeddings can be generated in advance - # and chunked prefilled. - max_decoder_mm_items = self.max_num_reqs * max_items_per_prompt + scheduler_config = self.scheduler_config + max_num_reqs = self.max_num_reqs + + if not scheduler_config.enable_chunked_prefill: + max_num_reqs = min( + max_num_reqs, + scheduler_config.max_num_batched_tokens // max_tokens_per_item, + ) + + max_decoder_items = max_num_reqs * max_items_per_prompt max_items_per_req = max( 1, - min(max_encoder_items, max_decoder_mm_items), + min(max_encoder_items, max_decoder_items), ) return max_items_per_prompt, max_items_per_req From b452419fa56d4cb3f79e1c809a7e0b5c228802de Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 2 Aug 2025 13:52:54 +0000 Subject: [PATCH 029/130] Fix naming Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 12 ++++++------ vllm/v1/worker/tpu_model_runner.py | 12 ++++++------ vllm/v1/worker/utils.py | 16 ++++++++-------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a6eeaa382ffd..6b3e8ede33c6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -603,7 +603,7 @@ def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: assert mm_budget is not None dummy_modality, _ = mm_budget \ - .get_modality_with_max_tokens_per_seq() + .get_modality_with_max_tokens_per_iter() dummy_mm_data = self._get_mm_decoder_dummy_data(dummy_modality, 1) @@ -2490,10 +2490,10 @@ def profile_run(self) -> None: ( dummy_modality, max_tokens, - ) = mm_budget.get_modality_with_max_tokens_per_seq() + ) = mm_budget.get_modality_with_max_tokens_per_iter() ( max_mm_items_per_prompt, - 
max_mm_items_per_req, + max_mm_items_per_iter, ) = mm_budget.get_max_items(dummy_modality, max_tokens) logger.info( @@ -2501,7 +2501,7 @@ def profile_run(self) -> None: "%s tokens, and profiled with %s %s items of the maximum " "feature size.", encoder_budget, - max_mm_items_per_req, + max_mm_items_per_iter, dummy_modality, ) @@ -2513,7 +2513,7 @@ def profile_run(self) -> None: batched_dummy_mm_inputs = self._get_mm_decoder_dummy_batch( dummy_modality, dummy_mm_data, - max_mm_items_per_req, + max_mm_items_per_iter, ) # Run multimodal encoder. @@ -2522,7 +2522,7 @@ def profile_run(self) -> None: sanity_check_mm_encoder_outputs( dummy_encoder_outputs, - expected_num_items=max_mm_items_per_req, + expected_num_items=max_mm_items_per_iter, ) # Cache the dummy encoder outputs. diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index cede18f7f82d..e90d89aa0b81 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1310,7 +1310,7 @@ def _precompile_mm_encoder(self) -> None: mm_budget = self.mm_budget assert mm_budget is not None - max_items_per_seq_by_modality = mm_budget.max_items_per_seq_by_modality + max_items_per_seq_by_modality = mm_budget.max_items_per_iter_by_modality max_items_per_prompt_by_modality = mm_budget.max_items_per_prompt_by_modality # noqa: E501 for mode, max_items_per_seq in max_items_per_seq_by_modality.items(): @@ -1543,10 +1543,10 @@ def profile_run( ( dummy_modality, max_tokens, - ) = mm_budget.get_modality_with_max_tokens_per_seq() + ) = mm_budget.get_modality_with_max_tokens_per_iter() ( max_mm_items_per_prompt, - max_mm_items_per_req, + max_mm_items_per_iter, ) = mm_budget.get_max_items(dummy_modality, max_tokens) logger.info( @@ -1554,7 +1554,7 @@ def profile_run( "%s tokens, and profiled with %s %s items of the maximum " "feature size.", encoder_budget, - max_mm_items_per_req, + max_mm_items_per_iter, dummy_modality, ) @@ -1566,7 +1566,7 @@ def profile_run( 
batched_dummy_mm_inputs = self._get_mm_decoder_dummy_batch( dummy_modality, dummy_mm_data, - max_mm_items_per_req, + max_mm_items_per_iter, ) # Run multimodal encoder. @@ -1585,7 +1585,7 @@ def profile_run( sanity_check_mm_encoder_outputs( dummy_encoder_outputs, - expected_num_items=max_mm_items_per_req, + expected_num_items=max_mm_items_per_iter, ) # Cache the dummy encoder outputs. diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index d0260cb6a277..179e350c42bb 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -56,18 +56,18 @@ def __init__( for modality, max_tokens in max_tokens_by_modality.items(): ( max_items_per_prompt, - max_items_per_req, + max_items_per_iter, ) = self.get_max_items(modality, max_tokens) max_items_per_prompt_by_modality[modality] = max_items_per_prompt - max_items_per_seq_by_modality[modality] = max_items_per_req + max_items_per_seq_by_modality[modality] = max_items_per_iter self.max_items_per_prompt_by_modality = max_items_per_prompt_by_modality - self.max_items_per_seq_by_modality = max_items_per_seq_by_modality + self.max_items_per_iter_by_modality = max_items_per_seq_by_modality - def get_modality_with_max_tokens_per_seq(self) -> tuple[str, int]: - max_tokens_per_seq_by_modality = self.max_items_per_seq_by_modality - modality, max_tokens = max(max_tokens_per_seq_by_modality.items(), + def get_modality_with_max_tokens_per_iter(self) -> tuple[str, int]: + max_tokens_per_iter_by_modality = self.max_items_per_iter_by_modality + modality, max_tokens = max(max_tokens_per_iter_by_modality.items(), key=lambda item: item[1]) return modality, max_tokens @@ -113,12 +113,12 @@ def get_max_items( max_decoder_items = max_num_reqs * max_items_per_prompt - max_items_per_req = max( + max_items_per_iter = max( 1, min(max_encoder_items, max_decoder_items), ) - return max_items_per_prompt, max_items_per_req + return max_items_per_prompt, max_items_per_iter def sanity_check_mm_encoder_outputs( From 
b8303dbeebd43d7bf1b3ed1233da4cdabdde9507 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 2 Aug 2025 14:00:40 +0000 Subject: [PATCH 030/130] Rename Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 20 ++++++++++---------- vllm/v1/worker/tpu_model_runner.py | 20 ++++++++++---------- vllm/v1/worker/utils.py | 22 +++++++++++----------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6b3e8ede33c6..011c3b8cbe44 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -603,7 +603,7 @@ def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: assert mm_budget is not None dummy_modality, _ = mm_budget \ - .get_modality_with_max_tokens_per_iter() + .get_modality_with_max_tokens_per_batch() dummy_mm_data = self._get_mm_decoder_dummy_data(dummy_modality, 1) @@ -2167,13 +2167,13 @@ def rand_input_ids() -> torch.Tensor: def _get_mm_decoder_dummy_data( self, modality: str, - processor_max_batch_size: int, + max_items_per_prompt: int, ) -> DummyDecoderData: """Dummy data for profiling and precompiling multimodal processor.""" model_config = self.model_config if model_config.get_multimodal_config().is_mm_processing_gpu: # Result in the maximum GPU consumption of HF processor - mm_counts = {modality: processor_max_batch_size} + mm_counts = {modality: max_items_per_prompt} disable_cache = True else: mm_counts = {modality: 1} @@ -2190,7 +2190,7 @@ def _get_mm_decoder_dummy_batch( self, modality: str, dummy_decoder_data: DummyDecoderData, - model_max_batch_size: int, + max_items_per_batch: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" dummy_mm_data = dummy_decoder_data.multi_modal_data @@ -2200,7 +2200,7 @@ def _get_mm_decoder_dummy_batch( dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] * - 
model_max_batch_size) + max_items_per_batch) return MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, device=self.device, @@ -2490,10 +2490,10 @@ def profile_run(self) -> None: ( dummy_modality, max_tokens, - ) = mm_budget.get_modality_with_max_tokens_per_iter() + ) = mm_budget.get_modality_with_max_tokens_per_batch() ( max_mm_items_per_prompt, - max_mm_items_per_iter, + max_mm_items_per_batch, ) = mm_budget.get_max_items(dummy_modality, max_tokens) logger.info( @@ -2501,7 +2501,7 @@ def profile_run(self) -> None: "%s tokens, and profiled with %s %s items of the maximum " "feature size.", encoder_budget, - max_mm_items_per_iter, + max_mm_items_per_batch, dummy_modality, ) @@ -2513,7 +2513,7 @@ def profile_run(self) -> None: batched_dummy_mm_inputs = self._get_mm_decoder_dummy_batch( dummy_modality, dummy_mm_data, - max_mm_items_per_iter, + max_mm_items_per_batch, ) # Run multimodal encoder. @@ -2522,7 +2522,7 @@ def profile_run(self) -> None: sanity_check_mm_encoder_outputs( dummy_encoder_outputs, - expected_num_items=max_mm_items_per_iter, + expected_num_items=max_mm_items_per_batch, ) # Cache the dummy encoder outputs. 
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index e90d89aa0b81..a45b5554b738 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1310,7 +1310,7 @@ def _precompile_mm_encoder(self) -> None: mm_budget = self.mm_budget assert mm_budget is not None - max_items_per_seq_by_modality = mm_budget.max_items_per_iter_by_modality + max_items_per_seq_by_modality = mm_budget.max_items_per_batch_by_modality # noqa: E501 max_items_per_prompt_by_modality = mm_budget.max_items_per_prompt_by_modality # noqa: E501 for mode, max_items_per_seq in max_items_per_seq_by_modality.items(): @@ -1543,10 +1543,10 @@ def profile_run( ( dummy_modality, max_tokens, - ) = mm_budget.get_modality_with_max_tokens_per_iter() + ) = mm_budget.get_modality_with_max_tokens_per_batch() ( max_mm_items_per_prompt, - max_mm_items_per_iter, + max_mm_items_per_batch, ) = mm_budget.get_max_items(dummy_modality, max_tokens) logger.info( @@ -1554,7 +1554,7 @@ def profile_run( "%s tokens, and profiled with %s %s items of the maximum " "feature size.", encoder_budget, - max_mm_items_per_iter, + max_mm_items_per_batch, dummy_modality, ) @@ -1566,7 +1566,7 @@ def profile_run( batched_dummy_mm_inputs = self._get_mm_decoder_dummy_batch( dummy_modality, dummy_mm_data, - max_mm_items_per_iter, + max_mm_items_per_batch, ) # Run multimodal encoder. @@ -1585,7 +1585,7 @@ def profile_run( sanity_check_mm_encoder_outputs( dummy_encoder_outputs, - expected_num_items=max_mm_items_per_iter, + expected_num_items=max_mm_items_per_batch, ) # Cache the dummy encoder outputs. 
@@ -1811,13 +1811,13 @@ def prepare_structured_decoding_input( def _get_mm_decoder_dummy_data( self, modality: str, - processor_max_batch_size: int, + max_items_per_prompt: int, ) -> DummyDecoderData: """Dummy data for profiling and precompiling multimodal processor.""" model_config = self.model_config if model_config.get_multimodal_config().is_mm_processing_gpu: # Result in the maximum GPU consumption of HF processor - mm_counts = {modality: processor_max_batch_size} + mm_counts = {modality: max_items_per_prompt} disable_cache = True else: mm_counts = {modality: 1} @@ -1834,7 +1834,7 @@ def _get_mm_decoder_dummy_batch( self, modality: str, dummy_decoder_data: DummyDecoderData, - model_max_batch_size: int, + max_items_per_batch: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" dummy_mm_data = dummy_decoder_data.multi_modal_data @@ -1844,7 +1844,7 @@ def _get_mm_decoder_dummy_batch( dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] * - model_max_batch_size) + max_items_per_batch) return MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, device=self.device, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 179e350c42bb..7a64e14caaae 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -56,18 +56,18 @@ def __init__( for modality, max_tokens in max_tokens_by_modality.items(): ( max_items_per_prompt, - max_items_per_iter, + max_items_per_batch, ) = self.get_max_items(modality, max_tokens) max_items_per_prompt_by_modality[modality] = max_items_per_prompt - max_items_per_seq_by_modality[modality] = max_items_per_iter + max_items_per_seq_by_modality[modality] = max_items_per_batch self.max_items_per_prompt_by_modality = max_items_per_prompt_by_modality - self.max_items_per_iter_by_modality = max_items_per_seq_by_modality + self.max_items_per_batch_by_modality = max_items_per_seq_by_modality - def 
get_modality_with_max_tokens_per_iter(self) -> tuple[str, int]: - max_tokens_per_iter_by_modality = self.max_items_per_iter_by_modality - modality, max_tokens = max(max_tokens_per_iter_by_modality.items(), + def get_modality_with_max_tokens_per_batch(self) -> tuple[str, int]: + max_tokens_per_batch_by_modality = self.max_items_per_batch_by_modality + modality, max_tokens = max(max_tokens_per_batch_by_modality.items(), key=lambda item: item[1]) return modality, max_tokens @@ -91,7 +91,7 @@ def get_max_items( if encoder_budget == 0: return 0, 0 - max_encoder_items = encoder_budget // max_tokens_per_item + max_encoder_items_per_batch = encoder_budget // max_tokens_per_item # Check how many items of this modality can be supported by # the decoder budget. @@ -111,14 +111,14 @@ def get_max_items( scheduler_config.max_num_batched_tokens // max_tokens_per_item, ) - max_decoder_items = max_num_reqs * max_items_per_prompt + max_decoder_items_per_batch = max_num_reqs * max_items_per_prompt - max_items_per_iter = max( + max_items_per_batch = max( 1, - min(max_encoder_items, max_decoder_items), + min(max_encoder_items_per_batch, max_decoder_items_per_batch), ) - return max_items_per_prompt, max_items_per_iter + return max_items_per_prompt, max_items_per_batch def sanity_check_mm_encoder_outputs( From 61b2d4a9fe41f6aebfcbd856fc61a855c2ddabca Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 2 Aug 2025 14:58:42 +0000 Subject: [PATCH 031/130] Add guard Signed-off-by: DarkLight1337 --- vllm/v1/worker/tpu_model_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index a45b5554b738..d157e23dc434 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1304,6 +1304,9 @@ def _set_active_loras(self, prompt_lora_mapping, token_lora_mapping, xm.mark_step() # Captures metadata updates def _precompile_mm_encoder(self) -> None: + if not self.is_multimodal_model: + 
return + # Pre-compile MM encoder for all supported data modalities. hf_config = self.vllm_config.model_config.hf_config From 3132a41639306105722ac260c31fd33d293906fd Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 3 Aug 2025 02:59:59 +0000 Subject: [PATCH 032/130] Remove unnecessary register Signed-off-by: DarkLight1337 --- tests/models/test_initialization.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 4c7da24fca32..75831f2eef69 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -33,11 +33,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"): - from vllm.model_executor.models.llama4 import Llama4ForCausalLM - from vllm.model_executor.models.registry import ModelRegistry - ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM) - # Avoid OOM and reduce initialization time by only using 1 layer def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: hf_config.update(model_info.hf_overrides) From e743c47fde82ffce72d67542c6f85b2446751436 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 3 Aug 2025 03:11:48 +0000 Subject: [PATCH 033/130] Fix Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 4 ++-- vllm/v1/worker/tpu_model_runner.py | 2 +- vllm/v1/worker/utils.py | 13 +++++++------ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 011c3b8cbe44..31e970c4636a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -603,7 +603,7 @@ def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: assert mm_budget is not None dummy_modality, _ = mm_budget \ - 
.get_modality_with_max_tokens_per_batch() + .get_modality_with_max_tokens() dummy_mm_data = self._get_mm_decoder_dummy_data(dummy_modality, 1) @@ -2490,7 +2490,7 @@ def profile_run(self) -> None: ( dummy_modality, max_tokens, - ) = mm_budget.get_modality_with_max_tokens_per_batch() + ) = mm_budget.get_modality_with_max_tokens() ( max_mm_items_per_prompt, max_mm_items_per_batch, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index d157e23dc434..09885b4244ec 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1546,7 +1546,7 @@ def profile_run( ( dummy_modality, max_tokens, - ) = mm_budget.get_modality_with_max_tokens_per_batch() + ) = mm_budget.get_modality_with_max_tokens() ( max_mm_items_per_prompt, max_mm_items_per_batch, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 7a64e14caaae..6761b3c5e41d 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -48,7 +48,7 @@ def __init__( self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config) max_items_per_prompt_by_modality = dict[str, int]() - max_items_per_seq_by_modality = dict[str, int]() + max_items_per_batch_by_modality = dict[str, int]() max_tokens_by_modality = mm_registry \ .get_max_tokens_per_item_by_nonzero_modality(model_config) @@ -60,14 +60,15 @@ def __init__( ) = self.get_max_items(modality, max_tokens) max_items_per_prompt_by_modality[modality] = max_items_per_prompt - max_items_per_seq_by_modality[modality] = max_items_per_batch + max_items_per_batch_by_modality[modality] = max_items_per_batch + self.max_tokens_by_modality = max_tokens_by_modality self.max_items_per_prompt_by_modality = max_items_per_prompt_by_modality - self.max_items_per_batch_by_modality = max_items_per_seq_by_modality + self.max_items_per_batch_by_modality = max_items_per_batch_by_modality - def get_modality_with_max_tokens_per_batch(self) -> tuple[str, int]: - max_tokens_per_batch_by_modality = 
self.max_items_per_batch_by_modality - modality, max_tokens = max(max_tokens_per_batch_by_modality.items(), + def get_modality_with_max_tokens(self) -> tuple[str, int]: + max_tokens_by_modality = self.max_tokens_by_modality + modality, max_tokens = max(max_tokens_by_modality.items(), key=lambda item: item[1]) return modality, max_tokens From 93af8435b115a306092864639c11d4439deb1573 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 5 Aug 2025 07:00:26 +0000 Subject: [PATCH 034/130] Update Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2cb15052f9f3..4a9615937fa9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -602,9 +602,7 @@ def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: mm_budget = self.mm_budget assert mm_budget is not None - dummy_modality, _ = mm_budget \ - .get_modality_with_max_tokens() - + dummy_modality, _ = mm_budget.get_modality_with_max_tokens() dummy_mm_data = self._get_mm_decoder_dummy_data(dummy_modality, 1) return self._get_mm_decoder_dummy_batch( From 63ff13ea67f764694eb100775ae8b891d2008182 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 5 Aug 2025 07:50:25 +0000 Subject: [PATCH 035/130] Don't disable caching Signed-off-by: DarkLight1337 --- vllm/engine/arg_utils.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9ff68ba5f9c1..9dd47e6ddee8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1231,14 +1231,6 @@ def create_engine_config( ) if model_config.is_multimodal_model: - mm_processor_kwargs = model_config.mm_processor_kwargs or {} - if (mm_processor_kwargs.get("device", "cpu") != "cpu" - and not model_config.disable_mm_preprocessor_cache): - logger.info("Multi-modal preprocessor cache is automatically " - "disabled to 
optimize the performance of " - "GPU-accelerated multi-modal processor.") - model_config.set_disable_mm_preprocessor_cache(True) - dp_supports_mm_preprocessor_cache = (self.data_parallel_size == 1 or data_parallel_external_lb) if (not dp_supports_mm_preprocessor_cache From 7f239079d69cabf16567d1d06f7bcf10557e2485 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 5 Aug 2025 07:52:47 +0000 Subject: [PATCH 036/130] Simplify Signed-off-by: DarkLight1337 --- vllm/config.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 5605bb7cacc7..9491afe120a5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3388,10 +3388,9 @@ class MultiModalConfig: @property def is_mm_processing_gpu(self) -> bool: - if not self.mm_processor_kwargs: - return False + kwargs = self.mm_processor_kwargs or {} - return self.mm_processor_kwargs.get("device", "cpu") != "cpu" + return kwargs.get("device", "cpu") != "cpu" def compute_hash(self) -> str: """ From d92111419deebf88ebac077b1e86c748e886b7a5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 5 Aug 2025 07:57:26 +0000 Subject: [PATCH 037/130] Update Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 6ed8b3c6eef5..d1b9845aaff9 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -189,8 +189,3 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", mm_processor_kwargs={"device": "cuda"}) ``` - -!!! note - Multi-modal processor cache is disabled when using GPU multi-modal processing - because GPU operations work better with larger batch size, which happens less - frequently when the cache is enabled. 
From b3b662b2f4c8eaedc9fb2954672f8344d13391e3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 5 Aug 2025 08:02:55 +0000 Subject: [PATCH 038/130] Reduce diffs Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 43 ++++++++++++++++-------------- vllm/engine/arg_utils.py | 22 +++++++-------- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index d1b9845aaff9..0bf5aa56773b 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -129,28 +129,14 @@ Data parallelism replicates the entire model across multiple GPU sets and proces Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`. Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size. -## Multi-modal Processing - -### Processor Cache - -By default, the multi-modal processor cache is enabled to avoid repeatedly calling Hugging Face processors -on the same multi-modal inputs, which commonly occurs in multi-turn conversations. - -You can adjust the size of the cache via `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). -The actual memory usage is double of this value because the cache is mirrored across API and engine core processes. - -If you do not benefit much from the cache, you can disable it completely via `disable_mm_preprocessor_cache`: - -```python -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - disable_mm_preprocessor_cache=True) -``` +## Input Processing ### Parallel Processing You can run input processing in parallel via [API server scale-out](../serving/data_parallel_deployment.md#internal-load-balancing). This is useful when input processing (which is run inside the API server) -becomes a bottleneck compared to model execution (which is run inside engine core). 
+becomes a bottleneck compared to model execution (which is run inside engine core) +and you have excess CPU capacity. ```console # Run 4 API processes and 1 engine core process @@ -164,12 +150,12 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 API server scale-out is only available for online inference. !!! note - Multi-modal processor cache is disabled when API server scale-out is enabled + [Multi-modal IPC cache](#ipc-cache) is disabled when API server scale-out is enabled because it requires a one-to-one correspondance between API and engine core processes. -### GPU-accelerated Processing +### GPU Multi-Modal Processing -You can speed up input processing by running Hugging Face processors on the GPU. +You can speed up multi-modal input processing by running Hugging Face processors on the GPU. To support this, the processor must accept a `device` argument in its call signature. As of this writing, the following processors are known to support GPU acceleration: @@ -189,3 +175,20 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", mm_processor_kwargs={"device": "cuda"}) ``` + +## Multi-Modal Caching + +### Processor Cache + +By default, the multi-modal processor cache is enabled to avoid repeatedly calling Hugging Face processors +on the same multi-modal inputs, which commonly occurs in multi-turn conversations. + +You can adjust the size of the cache via `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). +The actual memory usage is double of this value because the cache is mirrored across API and engine core processes. 
+ +If you do not benefit much from the cache, you can disable it completely via `disable_mm_preprocessor_cache`: + +```python +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + disable_mm_preprocessor_cache=True) +``` diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9dd47e6ddee8..5eb9660cd1e8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1230,17 +1230,17 @@ def create_engine_config( enable_multimodal_encoder_data_parallel, ) - if model_config.is_multimodal_model: - dp_supports_mm_preprocessor_cache = (self.data_parallel_size == 1 - or data_parallel_external_lb) - if (not dp_supports_mm_preprocessor_cache - and not model_config.disable_mm_preprocessor_cache): - logger.warning( - "Multi-modal preprocessor cache is not compatible " - "with data parallelism when there does not exist a " - "one-to-one correspondance between API process and " - "EngineCore process, so the cache will be disabled.") - model_config.set_disable_mm_preprocessor_cache(True) + supports_mm_preprocessor_cache = (self.data_parallel_size == 1 + or data_parallel_external_lb) + if (not supports_mm_preprocessor_cache + and model_config.is_multimodal_model + and not model_config.disable_mm_preprocessor_cache): + logger.warning( + "Multi-modal preprocessor cache is not compatible " + "with data parallelism when there does not exist a " + "one-to-one correspondance between API process and " + "EngineCore process, so the cache will be disabled.") + model_config.set_disable_mm_preprocessor_cache(True) speculative_config = self.create_speculative_config( target_model_config=model_config, From b2e58437f031f6dff2a84d4f423a5bb831772fc9 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 5 Aug 2025 08:04:09 +0000 Subject: [PATCH 039/130] Reduce diffs Signed-off-by: DarkLight1337 --- docs/configuration/conserving_memory.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 
02c0d7e28cf0..75d19e4420f4 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -87,8 +87,6 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", If you run out of CPU RAM, try the following options: - (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). - The actual memory usage is double of this value because the cache is mirrored across API and engine core processes. - You can also disable the cache entirely via the `disable_mm_preprocessor_cache` flag. - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). ## Multi-modal input limits From f48ce4f7c91f41466f7c4b85346689404faa53d3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 5 Aug 2025 15:35:41 +0000 Subject: [PATCH 040/130] Update doc Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 0bf5aa56773b..2d6f8b8f735d 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -176,6 +176,10 @@ llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", mm_processor_kwargs={"device": "cuda"}) ``` +!!! warning + The speed-up from GPU processing varies from model to model. In some cases, GPU processing may even become detrimental. + Make sure you perform benchmarking before enabling this! 
+ ## Multi-Modal Caching ### Processor Cache From a318923ba1b31a8863eee00daaf471f1d2b5e828 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 7 Aug 2025 16:53:26 +0000 Subject: [PATCH 041/130] Clean Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index ad2bf829674c..4553cbf823af 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -248,7 +248,7 @@ async def test_single_chat_session_input_audio( @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) async def test_chat_streaming_audio(client: openai.AsyncOpenAI, model_name: str, audio_url: str): - messages = messages = dummy_messages_from_audio_url(audio_url) + messages = dummy_messages_from_audio_url(audio_url) # test single completion chat_completion = await client.chat.completions.create( From 1fc7ac859a1e5a37874d8541619b0751757a716d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 13 Aug 2025 14:39:30 +0000 Subject: [PATCH 042/130] Revert profiling changes Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 19 ++++++++++-- vllm/multimodal/registry.py | 10 ++----- vllm/v1/worker/gpu_model_runner.py | 46 +++++----------------------- vllm/v1/worker/tpu_model_runner.py | 48 +++++------------------------- 4 files changed, 35 insertions(+), 88 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index d79b952dda4c..ce49bb65f879 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -153,6 +153,13 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 [Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled because it requires a one-to-one correspondance between API and engine core processes. +!!! 
warning + By default, 8 CPU threads are used in each API server to load media items (e.g. images) + from request data. + + If you apply API server scale-out, consider adjusting `VLLM_MEDIA_LOADING_THREAD_COUNT` + to avoid CPU resource exhaustion. + ### GPU Multi-Modal Processing You can speed up multi-modal input processing by running Hugging Face processors on the GPU. @@ -176,10 +183,18 @@ llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", mm_processor_kwargs={"device": "cuda"}) ``` -!!! warning - The speed-up from GPU processing varies from model to model. In some cases, GPU processing may even become detrimental. +!!! important + The speed-up from GPU processing varies from model to model. + In some cases, GPU processing may even become detrimental because of resource contention with + the forward pass of the model. Make sure you perform benchmarking before enabling this! +!!! warning + Currently, our memory profiler does not consider the GPU usage of applying input processing. + + Make sure you reserve additional memory beyond what is normally tracked by vLLM to avoid + OOM during inference. + ## Multi-Modal Caching ### Processor Cache diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 5afa17f5fe11..ded56cca8099 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -328,16 +328,13 @@ def get_decoder_dummy_data( model_config: "ModelConfig", seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, - *, - disable_cache: bool = False, ) -> DummyDecoderData: """ Create dummy data for profiling the memory usage of a model. The model is identified by ``model_config``. 
""" - processor = self.create_processor(model_config, - disable_cache=disable_cache) + processor = self.create_processor(model_config, disable_cache=False) profiler = MultiModalProfiler(processor) dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts) @@ -355,16 +352,13 @@ def get_encoder_dummy_data( model_config: "ModelConfig", seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, - *, - disable_cache: bool = False, ) -> DummyEncoderData: """ Create dummy data for profiling the memory usage of a model. The model is identified by ``model_config``. """ - processor = self.create_processor(model_config, - disable_cache=disable_cache) + processor = self.create_processor(model_config, disable_cache=False) profiler = MultiModalProfiler(processor) dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 072c0051ca56..1a536e1a1971 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -42,7 +42,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargsItem, PlaceholderRange) -from vllm.multimodal.profiling import DummyDecoderData from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingType @@ -653,13 +652,7 @@ def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: assert mm_budget is not None dummy_modality, _ = mm_budget.get_modality_with_max_tokens() - dummy_mm_data = self._get_mm_decoder_dummy_data(dummy_modality, 1) - - return self._get_mm_decoder_dummy_batch( - dummy_modality, - dummy_mm_data, - num_seqs, - ) + return self._get_mm_dummy_batch(dummy_modality, num_seqs) return {} @@ -2190,35 +2183,17 @@ def rand_input_ids() -> torch.Tensor: yield input_ids.fill_(0) - def _get_mm_decoder_dummy_data( - self, - modality: str, - max_items_per_prompt: int, 
- ) -> DummyDecoderData: - """Dummy data for profiling and precompiling multimodal processor.""" - model_config = self.model_config - if model_config.get_multimodal_config().is_mm_processing_gpu: - # Result in the maximum GPU consumption of HF processor - mm_counts = {modality: max_items_per_prompt} - disable_cache = True - else: - mm_counts = {modality: 1} - disable_cache = False - - return self.mm_registry.get_decoder_dummy_data( - model_config=model_config, - seq_len=self.max_num_tokens, - mm_counts=mm_counts, - disable_cache=disable_cache, - ) - - def _get_mm_decoder_dummy_batch( + def _get_mm_dummy_batch( self, modality: str, - dummy_decoder_data: DummyDecoderData, max_items_per_batch: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( + model_config=self.model_config, + seq_len=self.max_num_tokens, + mm_counts={modality: 1}, + ) dummy_mm_data = dummy_decoder_data.multi_modal_data # Result in the maximum GPU consumption of the model @@ -2531,13 +2506,8 @@ def profile_run(self) -> None: ) # Create dummy batch of multimodal inputs. 
- dummy_mm_data = self._get_mm_decoder_dummy_data( - dummy_modality, - max_mm_items_per_prompt, - ) - batched_dummy_mm_inputs = self._get_mm_decoder_dummy_batch( + batched_dummy_mm_inputs = self._get_mm_dummy_batch( dummy_modality, - dummy_mm_data, max_mm_items_per_batch, ) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index ab1207b33651..46262284e333 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -34,7 +34,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargsItem, PlaceholderRange) -from vllm.multimodal.profiling import DummyDecoderData from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.sequence import IntermediateTensors from vllm.tasks import GenerationTask, PoolingTask, SupportedTask @@ -1317,15 +1316,8 @@ def _precompile_mm_encoder(self) -> None: assert mm_budget is not None max_items_per_seq_by_modality = mm_budget.max_items_per_batch_by_modality # noqa: E501 - max_items_per_prompt_by_modality = mm_budget.max_items_per_prompt_by_modality # noqa: E501 for mode, max_items_per_seq in max_items_per_seq_by_modality.items(): - max_items_per_prompt = max_items_per_prompt_by_modality[mode] - dummy_mm_data = self._get_mm_decoder_dummy_data( - mode, - max_items_per_prompt, - ) - logger.info( "Compiling Multimodal %s Encoder with different input" " shapes.", mode) @@ -1333,9 +1325,8 @@ def _precompile_mm_encoder(self) -> None: # No padding for MM encoder just yet. for num_items in range(1, max_items_per_seq + 1): logger.info(" -- mode: %s items: %d", mode, num_items) - batched_dummy_mm_inputs = self._get_mm_decoder_dummy_batch( + batched_dummy_mm_inputs = self._get_mm_dummy_batch( mode, - dummy_mm_data, num_items, ) # Run multimodal encoder. @@ -1565,13 +1556,8 @@ def profile_run( ) # Create dummy batch of multimodal inputs. 
- dummy_mm_data = self._get_mm_decoder_dummy_data( - dummy_modality, - max_mm_items_per_prompt, - ) - batched_dummy_mm_inputs = self._get_mm_decoder_dummy_batch( + batched_dummy_mm_inputs = self._get_mm_dummy_batch( dummy_modality, - dummy_mm_data, max_mm_items_per_batch, ) @@ -1818,35 +1804,17 @@ def prepare_structured_decoding_input( self.grammar_bitmask_cpu[:num_reqs].to(logits.device), \ self.structured_decode_arange.to(logits.device) - def _get_mm_decoder_dummy_data( + def _get_mm_dummy_batch( self, modality: str, - max_items_per_prompt: int, - ) -> DummyDecoderData: - """Dummy data for profiling and precompiling multimodal processor.""" - model_config = self.model_config - if model_config.get_multimodal_config().is_mm_processing_gpu: - # Result in the maximum GPU consumption of HF processor - mm_counts = {modality: max_items_per_prompt} - disable_cache = True - else: - mm_counts = {modality: 1} - disable_cache = False - - return self.mm_registry.get_decoder_dummy_data( - model_config=model_config, - seq_len=self.max_num_tokens, - mm_counts=mm_counts, - disable_cache=disable_cache, - ) - - def _get_mm_decoder_dummy_batch( - self, - modality: str, - dummy_decoder_data: DummyDecoderData, max_items_per_batch: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( + model_config=self.model_config, + seq_len=self.max_num_tokens, + mm_counts={modality: 1}, + ) dummy_mm_data = dummy_decoder_data.multi_modal_data # Result in the maximum GPU consumption of the model From aa8bdb9945cbf7e79616b18fc4bfe365e2dfb0bd Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 13 Aug 2025 15:59:23 +0000 Subject: [PATCH 043/130] Try profiling processing Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 6 ---- vllm/utils/__init__.py | 52 +++++++++++++++++++++--------- vllm/v1/worker/gpu_model_runner.py | 42 ++++++++++++++++++++++-- 
vllm/v1/worker/gpu_worker.py | 5 +-- 4 files changed, 79 insertions(+), 26 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index ce49bb65f879..a025560a5b3f 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -189,12 +189,6 @@ llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", the forward pass of the model. Make sure you perform benchmarking before enabling this! -!!! warning - Currently, our memory profiler does not consider the GPU usage of applying input processing. - - Make sure you reserve additional memory beyond what is normally tracked by vLLM to avoid - OOM during inference. - ## Multi-Modal Caching ### Processor Cache diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 095829db8394..de0ab0cca4c8 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2621,26 +2621,41 @@ class MemoryProfilingResult: torch_peak_increase: int = 0 non_torch_increase: int = 0 weights_memory: float = 0 + processing_memory: int = 0 before_create: MemorySnapshot = field(default_factory=MemorySnapshot) before_profile: MemorySnapshot = field(default_factory=MemorySnapshot) after_profile: MemorySnapshot = field(default_factory=MemorySnapshot) profile_time: float = 0.0 def __repr__(self) -> str: - return (f"Memory profiling takes {self.profile_time:.2f} seconds. " - f"Total non KV cache memory: " - f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; " - f"torch peak memory increase: " - f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; " - f"non-torch forward increase memory: " - f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; " - f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB.") + summary = f"Memory profiling takes {self.profile_time:.2f} seconds." 
+ + detail = list[str]() + for title, value in [ + "Total non KV cache memory", + self.non_kv_cache_memory, + "torch peak memory increase", + self.torch_peak_increase, + "non-torch forward increase memory", + self.non_torch_increase, + "model weights memory", + self.weights_memory, + "input processing memory", + self.processing_memory, + ]: + if value > 0: + detail.append(f"{title}: {value / GiB_bytes:.2f}GiB") + + return f"{summary} {'; '.join(detail)}." @contextlib.contextmanager def memory_profiling( - baseline_snapshot: MemorySnapshot, - weights_memory: int) -> Generator[MemoryProfilingResult, None, None]: + baseline_snapshot: MemorySnapshot, + weights_memory: int, + *, + processing_memory: int = 0, +) -> Generator[MemoryProfilingResult, None, None]: """Memory profiling context manager. baseline_snapshot: the memory snapshot before the current vLLM instance. weights_memory: memory used by PyTorch when loading the model weights. @@ -2691,11 +2706,13 @@ def memory_profiling( torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() - result = MemoryProfilingResult() - - result.before_create = baseline_snapshot - # the part of memory used for holding the model weights - result.weights_memory = weights_memory + result = MemoryProfilingResult( + before_create=baseline_snapshot, + # the part of memory used for holding the model weights + weights_memory=weights_memory, + # the part of memory used for input processing + processing_memory=processing_memory, + ) result.before_profile.measure() @@ -2711,7 +2728,10 @@ def memory_profiling( result.torch_peak_increase = diff_profile.torch_peak result.non_torch_increase = diff_from_create.non_torch_memory result.profile_time = diff_profile.timestamp - result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa + result.non_kv_cache_memory = (result.non_torch_increase + + result.torch_peak_increase + + result.weights_memory + + result.processing_memory) # Adapted from: 
https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1a536e1a1971..52e719d95f35 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -48,8 +48,9 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size, - is_pin_memory_available, round_up, supports_dynamo) + GiB_bytes, LazyLoader, MemorySnapshot, check_use_alibi, + get_dtype_size, is_pin_memory_available, round_up, + supports_dynamo) from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, @@ -2022,6 +2023,43 @@ def load_model(self, eep_scale_up: bool = False) -> None: fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, backend=backend) + # We don't call this from the worker because + # not all model runners support this + self.maybe_profile_processing() + + def maybe_profile_processing(self) -> None: + model_config = self.model_config + mm_config = model_config.multimodal_config + + if mm_config and mm_config.is_mm_processing_gpu: + self.mm_registry.reset_processor_cache(model_config) + + mm_budget = self.mm_budget + assert mm_budget is not None + + time_before_processing = time.perf_counter() + before_profile = MemorySnapshot(auto_measure=True) + + self.mm_registry.get_decoder_dummy_data( + model_config=model_config, + seq_len=self.max_num_tokens, + mm_counts=mm_budget.max_items_per_batch_by_modality, + ) + + time_after_processing = time.perf_counter() + after_profile = MemorySnapshot(auto_measure=True) + + diff_profile = after_profile - before_profile + + # TODO: Multiply this by API server count + 
self.processor_memory_usage = diff_profile.torch_peak + + logger.info("Input processing took %.4f GiB and %.6f seconds", + self.processor_memory_usage / GiB_bytes, + time_after_processing - time_before_processing) + else: + self.processor_memory_usage = 0 + def reload_weights(self) -> None: assert getattr(self, "model", None) is not None, \ "Cannot reload weights before model is loaded." diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0ea23921a080..9558365e51ed 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -239,8 +239,9 @@ def determine_available_memory(self) -> int: # of the model. with memory_profiling( self.init_snapshot, - weights_memory=int( - self.model_runner.model_memory_usage)) as profile_result: + weights_memory=int(self.model_runner.model_memory_usage), + processing_memory=self.model_runner.processor_memory_usage, + ) as profile_result: self.model_runner.profile_run() free_gpu_memory = profile_result.after_profile.free_memory From 6d51d82301649e5dc902c89a6700e576477efb39 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 13 Aug 2025 18:32:27 +0000 Subject: [PATCH 044/130] Be more precise Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 4 ++++ vllm/config/parallel.py | 4 ++++ vllm/v1/engine/core_client.py | 2 ++ vllm/v1/worker/gpu_model_runner.py | 23 ++++++++++++++++------- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index a025560a5b3f..b58349a0368a 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -189,6 +189,10 @@ llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", the forward pass of the model. Make sure you perform benchmarking before enabling this! +!!! note + Additional memory needs to be reserved for GPU multi-modal processing, + so there is less memory left for your model and KV caching. 
+ ## Multi-Modal Caching ### Processor Cache diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index bac1e63800d7..a59b2aa9d279 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -71,6 +71,10 @@ class ParallelConfig: between local data parallel ranks, but an external LB balances between vLLM nodes/replicas. Set explicitly in conjunction with --data-parallel-start-rank.""" + + _api_server_count: int = 1 + """Set internally to indicate how many API processes are initialized.""" + enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" enable_eplb: bool = False diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 05b4d7260896..64207795ea3d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -91,6 +91,8 @@ def make_async_mp_client( client_index: int = 0, ) -> "MPClient": parallel_config = vllm_config.parallel_config + parallel_config._api_server_count = client_count + client_args = (vllm_config, executor_class, log_stats, client_addresses, client_count, client_index) if parallel_config.data_parallel_size > 1: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 52e719d95f35..fa8f66e63a7f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2037,22 +2037,31 @@ def maybe_profile_processing(self) -> None: mm_budget = self.mm_budget assert mm_budget is not None + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + time_before_processing = time.perf_counter() before_profile = MemorySnapshot(auto_measure=True) - self.mm_registry.get_decoder_dummy_data( - model_config=model_config, - seq_len=self.max_num_tokens, - mm_counts=mm_budget.max_items_per_batch_by_modality, - ) + for modality, max_items_per_batch in ( + mm_budget.max_items_per_batch_by_modality.items()): + self.mm_registry.get_decoder_dummy_data( + 
model_config=model_config, + seq_len=self.max_num_tokens, + mm_counts={modality: max_items_per_batch}, + ) + + gc.collect() + torch.cuda.empty_cache() time_after_processing = time.perf_counter() after_profile = MemorySnapshot(auto_measure=True) diff_profile = after_profile - before_profile - # TODO: Multiply this by API server count - self.processor_memory_usage = diff_profile.torch_peak + self.processor_memory_usage = diff_profile.torch_peak * ( + self.parallel_config._api_server_count) logger.info("Input processing took %.4f GiB and %.6f seconds", self.processor_memory_usage / GiB_bytes, From 6aae2175dfc5f7b083f431116f7c95005025a9c0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 13 Aug 2025 19:38:48 +0000 Subject: [PATCH 045/130] Try to auto-map GPU processor Signed-off-by: DarkLight1337 --- vllm/config/parallel.py | 4 ---- vllm/entrypoints/cli/serve.py | 38 +++++++++++++++++++++++++++++- vllm/v1/engine/core_client.py | 3 +-- vllm/v1/worker/gpu_model_runner.py | 4 +--- 4 files changed, 39 insertions(+), 10 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index a59b2aa9d279..bac1e63800d7 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -71,10 +71,6 @@ class ParallelConfig: between local data parallel ranks, but an external LB balances between vLLM nodes/replicas. 
Set explicitly in conjunction with --data-parallel-start-rank.""" - - _api_server_count: int = 1 - """Set internally to indicate how many API processes are initialized.""" - enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" enable_eplb: bool = False diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 803a3e004656..b5ca2998e083 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -17,6 +17,7 @@ from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG, show_filtered_argument_or_group_from_help) from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, decorate_logs, get_tcp_uri, set_process_title) @@ -224,9 +225,44 @@ def run_api_server_worker_proc(listen_address, # Set process title and add process-specific prefix to stdout and stderr. server_index = client_config.get("client_index", 0) if client_config else 0 - set_process_title("APIServer", str(server_index)) + process_name = set_process_title("APIServer", str(server_index)) decorate_logs() + # Try to run GPU processing on different devices for each API server + if mm_kwargs := args.mm_processor_kwargs: + mm_device: str = mm_kwargs.get("device", "cpu") + if mm_device != "cpu": + if mm_device == current_platform.device_type: + engine_device_count = max( + args.tensor_parallel_size or 1, + ((args.data_parallel_size or 1) + if args.data_parallel_size_local == 0 else min( + args.data_parallel_size or 1, + args.data_parallel_size_local or 1, + )), + ) + available_device_count = \ + current_platform.device_count() # type: ignore + + # Try to run processing on GPUs that are not used by the engine + device_idx = ((engine_device_count + server_index) % + available_device_count) + new_mm_device = f"{current_platform.device_name}:{device_idx}" + if new_mm_device != mm_device: + 
logger.info("Multi-modal processor is mapped to device %s", + process_name, new_mm_device) + + args.mm_processor_kwargs["device"] = new_mm_device + elif not mm_device.endswith(":0"): + logger.warning( + "You set a specific device %s for multi-modal processor " + "which is not on rank 0. " + "This potentially leads to OOM during inference because " + "vLLM's memory profiling for input processing is only run " + "on rank 0.", + mm_device, + ) + uvloop.run( run_server_worker(listen_address, sock, args, client_config, **uvicorn_kwargs)) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 64207795ea3d..97506109debb 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -91,8 +91,6 @@ def make_async_mp_client( client_index: int = 0, ) -> "MPClient": parallel_config = vllm_config.parallel_config - parallel_config._api_server_count = client_count - client_args = (vllm_config, executor_class, log_stats, client_addresses, client_count, client_index) if parallel_config.data_parallel_size > 1: @@ -739,6 +737,7 @@ def __init__(self, client_addresses=client_addresses, ) + self.client_count = client_count self.client_index = client_index self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs, Exception]]() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index fa8f66e63a7f..11c2e0042b8a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2059,9 +2059,7 @@ def maybe_profile_processing(self) -> None: after_profile = MemorySnapshot(auto_measure=True) diff_profile = after_profile - before_profile - - self.processor_memory_usage = diff_profile.torch_peak * ( - self.parallel_config._api_server_count) + self.processor_memory_usage = diff_profile.torch_peak logger.info("Input processing took %.4f GiB and %.6f seconds", self.processor_memory_usage / GiB_bytes, From 55c5e1b4332ba196ca07c36830cfad83d02f2446 Mon Sep 17 00:00:00 2001 From: DarkLight1337 
Date: Thu, 14 Aug 2025 04:55:48 +0000 Subject: [PATCH 046/130] Fix Signed-off-by: DarkLight1337 --- vllm/entrypoints/cli/serve.py | 11 ++++++----- vllm/v1/worker/gpu_model_runner.py | 6 +++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index b5ca2998e083..f08d993361d3 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -225,7 +225,7 @@ def run_api_server_worker_proc(listen_address, # Set process title and add process-specific prefix to stdout and stderr. server_index = client_config.get("client_index", 0) if client_config else 0 - process_name = set_process_title("APIServer", str(server_index)) + set_process_title("APIServer", str(server_index)) decorate_logs() # Try to run GPU processing on different devices for each API server @@ -249,14 +249,15 @@ def run_api_server_worker_proc(listen_address, available_device_count) new_mm_device = f"{current_platform.device_name}:{device_idx}" if new_mm_device != mm_device: - logger.info("Multi-modal processor is mapped to device %s", - process_name, new_mm_device) + logger.info( + "Multi-modal processor will be run on device %s", + new_mm_device) args.mm_processor_kwargs["device"] = new_mm_device elif not mm_device.endswith(":0"): logger.warning( - "You set a specific device %s for multi-modal processor " - "which is not on rank 0. " + "You assigned the multi-modal processor to a specific " + "device %s which is not on rank 0. 
" "This potentially leads to OOM during inference because " "vLLM's memory profiling for input processing is only run " "on rank 0.", diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 11c2e0042b8a..c9a88634dd03 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2044,12 +2044,12 @@ def maybe_profile_processing(self) -> None: time_before_processing = time.perf_counter() before_profile = MemorySnapshot(auto_measure=True) - for modality, max_items_per_batch in ( - mm_budget.max_items_per_batch_by_modality.items()): + for modality, max_items_per_prompt in ( + mm_budget.max_items_per_prompt_by_modality.items()): self.mm_registry.get_decoder_dummy_data( model_config=model_config, seq_len=self.max_num_tokens, - mm_counts={modality: max_items_per_batch}, + mm_counts={modality: max_items_per_prompt}, ) gc.collect() From bceb6bd84f76f1f576b8e026dd25d41d4b40c7b0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 07:47:57 +0000 Subject: [PATCH 047/130] Update Signed-off-by: DarkLight1337 --- .../test_api_server_process_manager.py | 6 +- vllm/config/__init__.py | 27 ++++- vllm/engine/arg_utils.py | 4 + vllm/entrypoints/cli/serve.py | 114 +++++++++++------- vllm/v1/utils.py | 28 +++-- vllm/v1/worker/gpu_model_runner.py | 3 +- 6 files changed, 123 insertions(+), 59 deletions(-) diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py index e4af60a78265..882382a38543 100644 --- a/tests/entrypoints/test_api_server_process_manager.py +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -36,10 +36,10 @@ def api_server_args(): "localhost:8000", "sock": sock, - "args": - "test_args", # Simple string to avoid pickling issues "num_servers": 3, + "args_per_server": + ["test_args"] * 3, # Simple string to avoid pickling issues "input_addresses": [ "tcp://127.0.0.1:5001", "tcp://127.0.0.1:5002", "tcp://127.0.0.1:5003" @@ 
-60,7 +60,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update): global WORKER_RUNTIME_SECONDS WORKER_RUNTIME_SECONDS = 0.5 - # Copy the args to avoid mutating the + # Copy the args to avoid mutating them args = api_server_args.copy() if not with_stats_update: diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index d3ac0080d761..b5807ccd7325 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -422,6 +422,13 @@ class ModelConfig: `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. Set to `0` to disable this cache completely (not recommended).""" + mm_processors_per_gpu: int = 1 + """ + [Internal] The maximum number of multi-modal processors that use each GPU. + + This is needed to determine the peak memory of multi-modal processing + in the case of API server scale-out. + """ override_neuron_config: dict[str, Any] = field(default_factory=dict) """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to @@ -842,10 +849,16 @@ def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: return None def set_mm_processor_cache_gb(self, value: int) -> None: - mm_config = self.get_multimodal_config() - self.mm_processor_cache_gb = value - mm_config.mm_processor_cache_gb = value + + if mm_config := self.multimodal_config: + mm_config.mm_processor_cache_gb = value + + def set_mm_processors_per_gpu(self, value: int) -> None: + self.mm_processors_per_gpu = value + + if mm_config := self.multimodal_config: + mm_config.mm_processors_per_gpu = value def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( @@ -2506,6 +2519,14 @@ class MultiModalConfig: Set to `0` to disable this cache completely (not recommended). """ + mm_processors_per_gpu: int = 1 + """ + [Internal] The maximum number of multi-modal processors that use each GPU. 
+ + This is needed to determine the peak memory of multi-modal processing + in the case of API server scale-out. + """ + interleave_mm_strings: bool = False """ Enable fully interleaved support for multimodal prompts. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c058001ceb97..8034d8375b7d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -350,6 +350,7 @@ class EngineArgs: MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb + mm_processors_per_gpu: int = MultiModalConfig.mm_processors_per_gpu # LoRA fields enable_lora: bool = False enable_lora_bias: bool = LoRAConfig.bias_enabled @@ -710,6 +711,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: multimodal_group.add_argument( "--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"]) + multimodal_group.add_argument( + "--mm_processors-per-gpu", + **multimodal_kwargs["mm_processors_per_gpu"]) multimodal_group.add_argument("--disable-mm-preprocessor-cache", type=bool, deprecated=True) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index f08d993361d3..f3d474710b27 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -3,7 +3,9 @@ import argparse import signal -from typing import Optional +from collections import Counter +from copy import deepcopy +from typing import Any, Optional import uvloop @@ -133,13 +135,29 @@ def signal_handler(signum, frame): engine_manager.close() +def _available_device_count() -> int: + return current_platform.device_count() # type: ignore + + +def _estimate_engine_device_count(args: argparse.Namespace) -> int: + tp_size = args.tensor_parallel_size or 1 + pp_size = args.pipeline_parallel_size or 1 + + dp_size = args.data_parallel_size or 1 + if args.data_parallel_size_local: + dp_size = min(dp_size, args.data_parallel_size_local) + + # This 
is a conservative estimate since it assumes single node + return min(tp_size * pp_size * dp_size, _available_device_count()) + + def run_multi_api_server(args: argparse.Namespace): assert not args.headless - num_api_servers = args.api_server_count + num_api_servers: int = args.api_server_count assert num_api_servers > 0 - orig_mm_processor_cache_gb = args.mm_processor_cache_gb + orig_mm_processor_cache_gb: int = args.mm_processor_cache_gb if num_api_servers > 1: setup_multiprocess_prometheus() @@ -175,6 +193,56 @@ def run_multi_api_server(args: argparse.Namespace): hybrid_dp_lb = parallel_config.data_parallel_hybrid_lb assert external_dp_lb or hybrid_dp_lb or dp_rank == 0 + args_per_server = [deepcopy(args) for _ in range(num_api_servers)] + + if mm_processor_kwargs := args.mm_processor_kwargs: + mm_device: str = mm_processor_kwargs.get("device", "cpu") + if mm_device != "cpu": + engine_device_count = _estimate_engine_device_count(args) + available_device_count = _available_device_count() + + engine_gpu_idxs = list(range(engine_device_count)) + + device_type, *rest = mm_device.rsplit(":", 1) + if len(rest) == 0: + # Try to run processing on GPUs that are not used by the engine + processor_gpu_idxs = [ + (engine_device_count + server_idx) % available_device_count + for server_idx in range(num_api_servers) + ] + for server_idx, device_idx in enumerate(processor_gpu_idxs): + device = f"{device_type}:{device_idx}" + args_per_server[server_idx].mm_processor_kwargs["device"] \ + = device + + logger.info( + "Multi-modal processor in APIServer_%s will be run on " + "device %s", server_idx, device) + else: + (device_idx, ) = map(int, rest) + processor_gpu_idxs = [device_idx] * num_api_servers + + processor_engine_gpu_idxs = [ + gpu_idx for gpu_idx in processor_gpu_idxs + if gpu_idx in engine_gpu_idxs + ] + mm_processors_per_gpu = max( + Counter(processor_engine_gpu_idxs).values(), + default=1, + ) + + # NOTE: vllm_config is used to initialize EngineCore while + # 
args_per_server is used to initialize API processes + vllm_config.model_config.set_mm_processors_per_gpu( + mm_processors_per_gpu) + for server_idx in range(num_api_servers): + args_per_server[server_idx].mm_processors_per_gpu \ + = mm_processors_per_gpu + + logger.info( + "Each GPU is shared by at most %d multi-modal processors", + mm_processors_per_gpu) + api_server_manager: Optional[APIServerProcessManager] = None with launch_core_engines(vllm_config, executor_class, log_stats, @@ -182,12 +250,12 @@ def run_multi_api_server(args: argparse.Namespace): coordinator, addresses): # Construct common args for the APIServerProcessManager up-front. - api_server_manager_kwargs = dict( + api_server_manager_kwargs = dict[str, Any]( target_server_fn=run_api_server_worker_proc, listen_address=listen_address, sock=sock, - args=args, num_servers=num_api_servers, + args_per_server=args_per_server, input_addresses=addresses.inputs, output_addresses=addresses.outputs, stats_update_address=coordinator.get_stats_publish_address() @@ -228,42 +296,6 @@ def run_api_server_worker_proc(listen_address, set_process_title("APIServer", str(server_index)) decorate_logs() - # Try to run GPU processing on different devices for each API server - if mm_kwargs := args.mm_processor_kwargs: - mm_device: str = mm_kwargs.get("device", "cpu") - if mm_device != "cpu": - if mm_device == current_platform.device_type: - engine_device_count = max( - args.tensor_parallel_size or 1, - ((args.data_parallel_size or 1) - if args.data_parallel_size_local == 0 else min( - args.data_parallel_size or 1, - args.data_parallel_size_local or 1, - )), - ) - available_device_count = \ - current_platform.device_count() # type: ignore - - # Try to run processing on GPUs that are not used by the engine - device_idx = ((engine_device_count + server_index) % - available_device_count) - new_mm_device = f"{current_platform.device_name}:{device_idx}" - if new_mm_device != mm_device: - logger.info( - "Multi-modal processor will be 
run on device %s", - new_mm_device) - - args.mm_processor_kwargs["device"] = new_mm_device - elif not mm_device.endswith(":0"): - logger.warning( - "You assigned the multi-modal processor to a specific " - "device %s which is not on rank 0. " - "This potentially leads to OOM during inference because " - "vLLM's memory profiling for input processing is only run " - "on rank 0.", - mm_device, - ) - uvloop.run( run_server_worker(listen_address, sock, args, client_config, **uvicorn_kwargs)) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index b5750c82db02..f35178da3977 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -123,8 +123,8 @@ def __init__( target_server_fn: Callable, listen_address: str, sock: Any, - args: argparse.Namespace, num_servers: int, + args_per_server: list[argparse.Namespace], input_addresses: list[str], output_addresses: list[str], stats_update_address: Optional[str] = None, @@ -141,29 +141,35 @@ def __init__( output_addresses: Output addresses for each API server stats_update_address: Optional stats update address """ + if len(args_per_server) != num_servers: + raise ValueError(f"Incorrect {len(args_per_server)=}") + if len(input_addresses) != num_servers: + raise ValueError(f"Incorrect {len(input_addresses)=}") + if len(output_addresses) != num_servers: + raise ValueError(f"Incorrect {len(output_addresses)=}") + self.listen_address = listen_address self.sock = sock - self.args = args # Start API servers spawn_context = multiprocessing.get_context("spawn") self.processes: list[BaseProcess] = [] - for i, in_addr, out_addr in zip(range(num_servers), input_addresses, - output_addresses): + for i in range(num_servers): client_config = { - "input_address": in_addr, - "output_address": out_addr, + "input_address": input_addresses[i], + "output_address": output_addresses[i], "client_count": num_servers, - "client_index": i + "client_index": i, } if stats_update_address is not None: client_config["stats_update_address"] = stats_update_address - 
proc = spawn_context.Process(target=target_server_fn, - name=f"ApiServer_{i}", - args=(listen_address, sock, args, - client_config)) + proc = spawn_context.Process( + target=target_server_fn, + name=f"ApiServer_{i}", + args=(listen_address, sock, args_per_server[i], client_config), + ) self.processes.append(proc) proc.start() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c9a88634dd03..291d695e4c87 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2059,7 +2059,8 @@ def maybe_profile_processing(self) -> None: after_profile = MemorySnapshot(auto_measure=True) diff_profile = after_profile - before_profile - self.processor_memory_usage = diff_profile.torch_peak + self.processor_memory_usage = diff_profile.torch_peak * ( + mm_config.mm_processors_per_gpu) logger.info("Input processing took %.4f GiB and %.6f seconds", self.processor_memory_usage / GiB_bytes, From 8e59dc89281b93413d0df4643afab7a3571b14bb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 07:52:12 +0000 Subject: [PATCH 048/130] Update docs Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index b58349a0368a..1ec9fb79d6a1 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -183,15 +183,16 @@ llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", mm_processor_kwargs={"device": "cuda"}) ``` -!!! important - The speed-up from GPU processing varies from model to model. - In some cases, GPU processing may even become detrimental because of resource contention with - the forward pass of the model. - Make sure you perform benchmarking before enabling this! - !!! note - Additional memory needs to be reserved for GPU multi-modal processing, - so there is less memory left for your model and KV caching. 
+ vLLM will try to allocate visible GPUs that are not used by the core engine + for multi-modal processing. If this is not possible, then the same GPU + will be used for multi-modal processing and model forward pass, resulting + in resource contention (both I/O and memory capacity). + +!!! important + The performance improvement from GPU processing varies from model to model. + In some cases, GPU processing may even become detrimental because of resource contention. + Make sure to perform benchmarking before enabling this! ## Multi-Modal Caching From 646ba9367e4b34495bc1827d082be773ae79558d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 10:59:31 +0000 Subject: [PATCH 049/130] Update Signed-off-by: DarkLight1337 --- vllm/config/__init__.py | 28 ++++++++--- vllm/config/parallel.py | 6 +++ vllm/engine/arg_utils.py | 46 ++++++++++++++--- vllm/entrypoints/cli/serve.py | 79 ++---------------------------- vllm/multimodal/utils.py | 63 ++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 7 +-- 6 files changed, 137 insertions(+), 92 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index b5807ccd7325..974fee64bd9d 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -422,9 +422,10 @@ class ModelConfig: `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. Set to `0` to disable this cache completely (not recommended).""" - mm_processors_per_gpu: int = 1 + mm_processors_per_engine_gpu: int = 0 """ - [Internal] The maximum number of multi-modal processors that use each GPU. + [Internal] The maximum number of multi-modal processors that use each GPU + in vLLM engine. This is needed to determine the peak memory of multi-modal processing in the case of API server scale-out. 
@@ -854,11 +855,23 @@ def set_mm_processor_cache_gb(self, value: int) -> None: if mm_config := self.multimodal_config: mm_config.mm_processor_cache_gb = value - def set_mm_processors_per_gpu(self, value: int) -> None: - self.mm_processors_per_gpu = value + def set_mm_processor_kwargs(self, value: dict[str, Any]) -> None: + if self.mm_processor_kwargs is None: + self.mm_processor_kwargs = {} + + self.mm_processor_kwargs.update(value) + + if mm_config := self.multimodal_config: + if mm_config.mm_processor_kwargs is None: + mm_config.mm_processor_kwargs = {} + + mm_config.mm_processor_kwargs.update(value) + + def set_mm_processors_per_engine_gpu(self, value: int) -> None: + self.mm_processors_per_engine_gpu = value if mm_config := self.multimodal_config: - mm_config.mm_processors_per_gpu = value + mm_config.mm_processors_per_engine_gpu = value def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( @@ -2519,9 +2532,10 @@ class MultiModalConfig: Set to `0` to disable this cache completely (not recommended). """ - mm_processors_per_gpu: int = 1 + mm_processors_per_engine_gpu: int = -1 """ - [Internal] The maximum number of multi-modal processors that use each GPU. + [Internal] The maximum number of multi-modal processors that use each GPU + in vLLM engine. A value of `-1` means not set. This is needed to determine the peak memory of multi-modal processing in the case of API server scale-out. diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index bac1e63800d7..b03999e3b91b 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -71,6 +71,12 @@ class ParallelConfig: between local data parallel ranks, but an external LB balances between vLLM nodes/replicas. 
Set explicitly in conjunction with --data-parallel-start-rank.""" + + api_process_count: int = 1 + """[Internal] The number of API processes initialized.""" + api_process_rank: int = 0 + """[Internal] The rank of this API process.""" + enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" enable_eplb: bool = False diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8034d8375b7d..e7ac8aea7059 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -302,6 +302,8 @@ class EngineArgs: data_parallel_rpc_port: Optional[int] = None data_parallel_hybrid_lb: bool = False data_parallel_backend: str = ParallelConfig.data_parallel_backend + api_process_count: int = ParallelConfig.api_process_count + api_process_rank: int = ParallelConfig.api_process_rank enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel enable_eplb: bool = ParallelConfig.enable_eplb num_redundant_experts: int = ParallelConfig.num_redundant_experts @@ -350,7 +352,8 @@ class EngineArgs: MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb - mm_processors_per_gpu: int = MultiModalConfig.mm_processors_per_gpu + mm_processors_per_engine_gpu: int = \ + MultiModalConfig.mm_processors_per_engine_gpu # LoRA fields enable_lora: bool = False enable_lora_bias: bool = LoRAConfig.bias_enabled @@ -713,7 +716,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **multimodal_kwargs["mm_processor_cache_gb"]) multimodal_group.add_argument( "--mm_processors-per-gpu", - **multimodal_kwargs["mm_processors_per_gpu"]) + **multimodal_kwargs["mm_processors_per_engine_gpu"]) multimodal_group.add_argument("--disable-mm-preprocessor-cache", type=bool, deprecated=True) @@ -858,7 +861,10 @@ def from_cli_args(cls, args: argparse.Namespace): # Get the list of attributes of this dataclass. 
attrs = [attr.name for attr in dataclasses.fields(cls)] # Set the attributes from the parsed arguments. - engine_args = cls(**{attr: getattr(args, attr) for attr in attrs}) + engine_args = cls(**{ + attr: getattr(args, attr) + for attr in attrs if hasattr(args, attr) + }) return engine_args def create_model_config(self) -> ModelConfig: @@ -1219,6 +1225,8 @@ def create_engine_config( data_parallel_rpc_port=data_parallel_rpc_port, data_parallel_backend=self.data_parallel_backend, data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, + api_process_count=self.api_process_count, + api_process_rank=self.api_process_rank, enable_expert_parallel=self.enable_expert_parallel, enable_eplb=self.enable_eplb, num_redundant_experts=self.num_redundant_experts, @@ -1238,17 +1246,41 @@ def create_engine_config( ) if model_config.is_multimodal_model: - dp_supports_mm_processor_cache = (self.data_parallel_size == 1 - or data_parallel_external_lb) - if (not dp_supports_mm_processor_cache + supports_mm_processor_cache = self.api_process_count == 1 and ( + self.data_parallel_size == 1 or data_parallel_external_lb) + if (not supports_mm_processor_cache and model_config.mm_processor_cache_gb > 0): logger.warning( "Multi-modal processor cache is disabled because " - "it is not compatible with data parallelism when " "there does not exist a one-to-one correspondance " "between API and engine core processes.") model_config.set_mm_processor_cache_gb(0) + if mm_processor_kwargs := self.mm_processor_kwargs: + from vllm.multimodal.utils import allocate_gpu_mm_processors + + mm_processor_device: str = mm_processor_kwargs.get( + "device", "cpu") + if mm_processor_device != "cpu": + ( + gpu_allocation, + mm_processors_per_engine_gpu, + ) = allocate_gpu_mm_processors( + mm_processor_device, + self.api_process_count, + world_size=parallel_config.world_size_across_dp, + ) + + new_device = gpu_allocation[self.api_process_rank] + logger.info( + "Multi-modal processor will be run on device %s", + 
new_device) + + model_config.set_mm_processor_kwargs( + {"device": new_device}) + model_config.set_mm_processors_per_engine_gpu( + mm_processors_per_engine_gpu) + speculative_config = self.create_speculative_config( target_model_config=model_config, target_parallel_config=parallel_config, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index f3d474710b27..95a8346bd736 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -3,7 +3,6 @@ import argparse import signal -from collections import Counter from copy import deepcopy from typing import Any, Optional @@ -19,7 +18,6 @@ from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG, show_filtered_argument_or_group_from_help) from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, decorate_logs, get_tcp_uri, set_process_title) @@ -135,42 +133,23 @@ def signal_handler(signum, frame): engine_manager.close() -def _available_device_count() -> int: - return current_platform.device_count() # type: ignore - - -def _estimate_engine_device_count(args: argparse.Namespace) -> int: - tp_size = args.tensor_parallel_size or 1 - pp_size = args.pipeline_parallel_size or 1 - - dp_size = args.data_parallel_size or 1 - if args.data_parallel_size_local: - dp_size = min(dp_size, args.data_parallel_size_local) - - # This is a conservative estimate since it assumes single node - return min(tp_size * pp_size * dp_size, _available_device_count()) - - def run_multi_api_server(args: argparse.Namespace): assert not args.headless num_api_servers: int = args.api_server_count assert num_api_servers > 0 - orig_mm_processor_cache_gb: int = args.mm_processor_cache_gb + # No need to set api_process_rank for EngineCore processes + args.api_process_count = args.api_server_count if num_api_servers > 1: setup_multiprocess_prometheus() - # Not compatible with API server scale-out - 
args.mm_processor_cache_gb = 0 - listen_address, sock = setup_server(args) engine_args = vllm.AsyncEngineArgs.from_cli_args(args) usage_context = UsageContext.OPENAI_API_SERVER vllm_config = engine_args.create_engine_config(usage_context=usage_context) - model_config = vllm_config.model_config if num_api_servers > 1: if not envs.VLLM_USE_V1: @@ -180,10 +159,6 @@ def run_multi_api_server(args: argparse.Namespace): raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used " "with api_server_count > 1") - if model_config.is_multimodal_model and orig_mm_processor_cache_gb > 0: - logger.warning("Multi-modal processor cache is disabled because " - "it is not compatible with `api_server_count > 1`.") - executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats @@ -194,54 +169,8 @@ def run_multi_api_server(args: argparse.Namespace): assert external_dp_lb or hybrid_dp_lb or dp_rank == 0 args_per_server = [deepcopy(args) for _ in range(num_api_servers)] - - if mm_processor_kwargs := args.mm_processor_kwargs: - mm_device: str = mm_processor_kwargs.get("device", "cpu") - if mm_device != "cpu": - engine_device_count = _estimate_engine_device_count(args) - available_device_count = _available_device_count() - - engine_gpu_idxs = list(range(engine_device_count)) - - device_type, *rest = mm_device.rsplit(":", 1) - if len(rest) == 0: - # Try to run processing on GPUs that are not used by the engine - processor_gpu_idxs = [ - (engine_device_count + server_idx) % available_device_count - for server_idx in range(num_api_servers) - ] - for server_idx, device_idx in enumerate(processor_gpu_idxs): - device = f"{device_type}:{device_idx}" - args_per_server[server_idx].mm_processor_kwargs["device"] \ - = device - - logger.info( - "Multi-modal processor in APIServer_%s will be run on " - "device %s", server_idx, device) - else: - (device_idx, ) = map(int, rest) - processor_gpu_idxs = [device_idx] * num_api_servers - - processor_engine_gpu_idxs = [ 
- gpu_idx for gpu_idx in processor_gpu_idxs - if gpu_idx in engine_gpu_idxs - ] - mm_processors_per_gpu = max( - Counter(processor_engine_gpu_idxs).values(), - default=1, - ) - - # NOTE: vllm_config is used to initialize EngineCore while - # args_per_server is used to initialize API processes - vllm_config.model_config.set_mm_processors_per_gpu( - mm_processors_per_gpu) - for server_idx in range(num_api_servers): - args_per_server[server_idx].mm_processors_per_gpu \ - = mm_processors_per_gpu - - logger.info( - "Each GPU is shared by at most %d multi-modal processors", - mm_processors_per_gpu) + for server_idx in range(num_api_servers): + args_per_server[server_idx].api_process_rank = server_idx api_server_manager: Optional[APIServerProcessManager] = None diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3b01ee7ad4a4..b004fcb74d2e 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -3,6 +3,7 @@ import asyncio import atexit +from collections import Counter from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor from itertools import groupby @@ -21,6 +22,7 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather) +from vllm.platforms import current_platform from .audio import AudioMediaIO from .base import MediaIO @@ -333,6 +335,67 @@ def encode_video_base64(frames: npt.NDArray) -> str: return video_io.encode_base64(frames) +def allocate_gpu_mm_processors( + mm_processor_device: str, + mm_processor_count: int, + *, + world_size: int, +) -> tuple[list[str], int]: + """ + Given `--mm_processor_kwargs.device` and the number of multi-modal + processors, return the GPU allocation information. + + Returns: + A tuple `(mm_processor_gpus, mm_processors_per_engine_gpu)`, where: + - `gpu_allocation` is the device to allocate for each + multi-modal processor. 
+ - `mm_processors_per_engine_gpu` is the number of + multi-modal processors allocated to each GPU that is used + by vLLM engine. + """ + available_device_count = current_platform.device_count() # type: ignore + engine_device_count = min(world_size, available_device_count) + + engine_gpu_idxs = list(range(engine_device_count)) + + # In API server scale-out, allocate_gpu_mm_processors is called twice. + # The first call happens in vllm.entrypoints.cli.serve and corresponds + # to len(rest) == 0, resulting in each server targeting a specific device. + # The second call happens in arg_utils.py and corresponds to len(rest) = 1 + device_type, *rest = mm_processor_device.rsplit(":", 1) + if len(rest) == 0: + # Try to run each processor on a different GPU, preferably those + # that are not used by vLLM engine + remaining_count = max(0, available_device_count - engine_device_count) + if remaining_count > 0: + processor_gpu_idxs = [ + engine_device_count + server_idx % remaining_count + for server_idx in range(mm_processor_count) + ] + else: + processor_gpu_idxs = [ + server_idx % available_device_count + for server_idx in range(mm_processor_count) + ] + else: + # Already targeted a specific GPU + (device_idx, ) = map(int, rest) + processor_gpu_idxs = [device_idx] * mm_processor_count + + gpu_allocation = [ + f"{device_type}:{gpu_idx}" for gpu_idx in processor_gpu_idxs + ] + + processor_engine_gpu_idxs = (gpu_idx for gpu_idx in processor_gpu_idxs + if gpu_idx in engine_gpu_idxs) + mm_processors_per_engine_gpu = max( + Counter(processor_engine_gpu_idxs).values(), + default=0, + ) + + return gpu_allocation, mm_processors_per_engine_gpu + + def argsort_mm_positions( mm_positions: MultiModalPlaceholderDict) -> list[tuple[str, int]]: """ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 291d695e4c87..e0bb88b86932 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2031,7 +2031,9 @@ def 
maybe_profile_processing(self) -> None: model_config = self.model_config mm_config = model_config.multimodal_config - if mm_config and mm_config.is_mm_processing_gpu: + if mm_config and (mm_config.is_mm_processing_gpu and + (usage_mult := + mm_config.mm_processors_per_engine_gpu) > 0): self.mm_registry.reset_processor_cache(model_config) mm_budget = self.mm_budget @@ -2059,8 +2061,7 @@ def maybe_profile_processing(self) -> None: after_profile = MemorySnapshot(auto_measure=True) diff_profile = after_profile - before_profile - self.processor_memory_usage = diff_profile.torch_peak * ( - mm_config.mm_processors_per_gpu) + self.processor_memory_usage = diff_profile.torch_peak * usage_mult logger.info("Input processing took %.4f GiB and %.6f seconds", self.processor_memory_usage / GiB_bytes, From 794cb4e6d65099560892178dd5b757d5749fecf2 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 12:20:12 +0000 Subject: [PATCH 050/130] Test Signed-off-by: DarkLight1337 --- tests/multimodal/test_utils.py | 139 +++++++++++++++++++++++++++------ vllm/engine/arg_utils.py | 5 +- vllm/multimodal/utils.py | 7 +- 3 files changed, 123 insertions(+), 28 deletions(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 41f4773a11c8..cb3ea0171c27 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -5,7 +5,6 @@ import mimetypes import os from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import TYPE_CHECKING, NamedTuple import numpy as np import pytest @@ -19,14 +18,12 @@ initialize_model_parallel) from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import PlaceholderRange -from vllm.multimodal.utils import (MediaConnector, argsort_mm_positions, +from vllm.multimodal.utils import (MediaConnector, allocate_gpu_mm_processors, + argsort_mm_positions, run_dp_sharded_vision_model) from vllm.platforms import current_platform from vllm.utils import get_open_port, 
update_environment_variables -if TYPE_CHECKING: - from vllm.multimodal.inputs import MultiModalPlaceholderDict - # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_URLS = [ "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", @@ -176,18 +173,110 @@ async def test_fetch_video_http(video_url: str, num_frames: int): assert metadata_sync == metadata_async -# Used for `test_argsort_mm_positions`. -class TestCase(NamedTuple): - mm_positions: "MultiModalPlaceholderDict" - expected_modality_idxs: list[tuple[str, int]] +# yapf: disable +@pytest.mark.parametrize( + "case", + [ + # Base case + dict( + mm_processor_device="cuda", + mm_processor_count=1, + available_device_count=1, + engine_device_count=1, + expected_gpu_allocation=["cuda:0"], + expected_mm_processors_per_engine_gpu=1, + ), + # Use Engine GPUs + dict( + mm_processor_device="cuda", + mm_processor_count=2, + available_device_count=1, + engine_device_count=1, + expected_gpu_allocation=["cuda:0", "cuda:0"], + expected_mm_processors_per_engine_gpu=2, + ), + dict( + mm_processor_device="cuda", + mm_processor_count=2, + available_device_count=1, + engine_device_count=2, + expected_gpu_allocation=["cuda:0", "cuda:0"], + expected_mm_processors_per_engine_gpu=2, + ), + dict( + mm_processor_device="cuda", + mm_processor_count=2, + available_device_count=2, + engine_device_count=2, + expected_gpu_allocation=["cuda:0", "cuda:1"], + expected_mm_processors_per_engine_gpu=1, + ), + dict( + mm_processor_device="cuda", + mm_processor_count=3, + available_device_count=2, + engine_device_count=2, + expected_gpu_allocation=["cuda:0", "cuda:1", "cuda:0"], + expected_mm_processors_per_engine_gpu=2, + ), + # Use excess GPUs + dict( + mm_processor_device="cuda", + mm_processor_count=2, + available_device_count=3, + engine_device_count=2, + expected_gpu_allocation=["cuda:2", "cuda:2"], + 
expected_mm_processors_per_engine_gpu=0, + ), + dict( + mm_processor_device="cuda", + mm_processor_count=2, + available_device_count=4, + engine_device_count=2, + expected_gpu_allocation=["cuda:2", "cuda:3"], + expected_mm_processors_per_engine_gpu=0, + ), + dict( + mm_processor_device="cuda", + mm_processor_count=3, + available_device_count=4, + engine_device_count=2, + expected_gpu_allocation=["cuda:2", "cuda:3", "cuda:2"], + expected_mm_processors_per_engine_gpu=0, + ), + ], +) +# yapf: enable +def test_allocate_gpu_mm_processors(case): + mm_processor_device = case["mm_processor_device"] + mm_processor_count = case["mm_processor_count"] + available_device_count = case["available_device_count"] + engine_device_count = case["engine_device_count"] + expected_gpu_allocation = case["expected_gpu_allocation"] + expected_mm_processors_per_engine_gpu = case[ + "expected_mm_processors_per_engine_gpu"] + + ( + gpu_allocation, + mm_processors_per_engine_gpu, + ) = allocate_gpu_mm_processors( + mm_processor_device, + mm_processor_count, + available_device_count=available_device_count, + engine_device_count=engine_device_count, + ) + assert gpu_allocation == expected_gpu_allocation + assert mm_processors_per_engine_gpu == expected_mm_processors_per_engine_gpu -def test_argsort_mm_positions(): - test_cases = [ +# yapf: disable +@pytest.mark.parametrize( + "case", + [ # Single modality ## Internally sorted - TestCase( + dict( mm_positions={ "image": [ PlaceholderRange(offset=0, length=2), @@ -200,7 +289,7 @@ def test_argsort_mm_positions(): ], ), ## Internally unsorted - TestCase( + dict( mm_positions={ "image": [ PlaceholderRange(offset=3, length=2), @@ -215,7 +304,7 @@ def test_argsort_mm_positions(): # Two modalities ## Internally sorted - TestCase( + dict( mm_positions={ "image": [ PlaceholderRange(offset=7, length=4), @@ -234,7 +323,7 @@ def test_argsort_mm_positions(): ], ), ## Interleaved, internally sorted - TestCase( + dict( mm_positions={ "image": [ 
PlaceholderRange(offset=0, length=4), @@ -253,7 +342,7 @@ def test_argsort_mm_positions(): ], ), ## Interleaved, internally unsorted - TestCase( + dict( mm_positions={ "image": [ PlaceholderRange(offset=8, length=2), @@ -274,7 +363,7 @@ def test_argsort_mm_positions(): # Three modalities ## Internally sorted - TestCase( + dict( mm_positions={ "image": [ PlaceholderRange(offset=15, length=7), @@ -299,7 +388,7 @@ def test_argsort_mm_positions(): ], ), ## Interleaved, internally sorted - TestCase( + dict( mm_positions={ "image": [ PlaceholderRange(offset=0, length=2), @@ -322,7 +411,7 @@ def test_argsort_mm_positions(): ], ), ## Interleaved, internally sunorted - TestCase( + dict( mm_positions={ "image": [ PlaceholderRange(offset=0, length=2), @@ -344,12 +433,16 @@ def test_argsort_mm_positions(): ("image", 1), ], ), - ] + ], +) +# yapf: enable +def test_argsort_mm_positions(case): + mm_positions = case["mm_positions"] + expected_modality_idxs = case["expected_modality_idxs"] - for mm_positions, expected_modality_idxs in test_cases: - modality_idxs = argsort_mm_positions(mm_positions) + modality_idxs = argsort_mm_positions(mm_positions) - assert modality_idxs == expected_modality_idxs + assert modality_idxs == expected_modality_idxs class SimpleLinearModel(torch.nn.Module): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e7ac8aea7059..d1f1813589fb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1268,7 +1268,10 @@ def create_engine_config( ) = allocate_gpu_mm_processors( mm_processor_device, self.api_process_count, - world_size=parallel_config.world_size_across_dp, + available_device_count=current_platform.device_count( + ), # type: ignore + engine_device_count=parallel_config. 
+ world_size_across_dp, ) new_device = gpu_allocation[self.api_process_rank] diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index b004fcb74d2e..27015112aa2b 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -22,7 +22,6 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather) -from vllm.platforms import current_platform from .audio import AudioMediaIO from .base import MediaIO @@ -339,7 +338,8 @@ def allocate_gpu_mm_processors( mm_processor_device: str, mm_processor_count: int, *, - world_size: int, + available_device_count: int, + engine_device_count: int, ) -> tuple[list[str], int]: """ Given `--mm_processor_kwargs.device` and the number of multi-modal @@ -353,8 +353,7 @@ def allocate_gpu_mm_processors( multi-modal processors allocated to each GPU that is used by vLLM engine. """ - available_device_count = current_platform.device_count() # type: ignore - engine_device_count = min(world_size, available_device_count) + engine_device_count = min(engine_device_count, available_device_count) engine_gpu_idxs = list(range(engine_device_count)) From 07f0f1f11a70a2f4d0a7c3cc7a4037c1c64adb36 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 12:39:24 +0000 Subject: [PATCH 051/130] Fix typo Signed-off-by: DarkLight1337 --- tests/multimodal/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index cb3ea0171c27..7f04d6a49924 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -410,7 +410,7 @@ def test_allocate_gpu_mm_processors(case): ("image", 2), ], ), - ## Interleaved, internally sunorted + ## Interleaved, internally unsorted dict( mm_positions={ "image": [ From bce21a28cfe998a2b3842af0ac01ba667a77289d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 12:40:55 +0000 Subject: [PATCH 052/130] Fix arg 
Signed-off-by: DarkLight1337 --- vllm/engine/arg_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d1f1813589fb..56db60cd87c3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -714,9 +714,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: multimodal_group.add_argument( "--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"]) - multimodal_group.add_argument( - "--mm_processors-per-gpu", - **multimodal_kwargs["mm_processors_per_engine_gpu"]) multimodal_group.add_argument("--disable-mm-preprocessor-cache", type=bool, deprecated=True) @@ -932,6 +929,7 @@ def create_model_config(self) -> ModelConfig: config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, + mm_processors_per_engine_gpu=self.mm_processors_per_engine_gpu, override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, From 61d542289e7a1e9b3d1372f431e907b1035ddf22 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 12:43:03 +0000 Subject: [PATCH 053/130] Simplify Signed-off-by: DarkLight1337 --- vllm/utils/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index de0ab0cca4c8..3d538c62b850 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2630,8 +2630,8 @@ class MemoryProfilingResult: def __repr__(self) -> str: summary = f"Memory profiling takes {self.profile_time:.2f} seconds." 
- detail = list[str]() - for title, value in [ + detail = [ + f"{title}: {value / GiB_bytes:.2f}GiB" for title, value in [ "Total non KV cache memory", self.non_kv_cache_memory, "torch peak memory increase", @@ -2642,9 +2642,8 @@ def __repr__(self) -> str: self.weights_memory, "input processing memory", self.processing_memory, - ]: - if value > 0: - detail.append(f"{title}: {value / GiB_bytes:.2f}GiB") + ] + ] return f"{summary} {'; '.join(detail)}." From 153d971bf4c4b88079ffdb1b97158f2012bb9e48 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 13:48:46 +0000 Subject: [PATCH 054/130] Update Signed-off-by: DarkLight1337 --- tests/multimodal/test_utils.py | 33 +++++++----- vllm/config/__init__.py | 21 ++++---- vllm/engine/arg_utils.py | 11 ++-- vllm/multimodal/utils.py | 21 +++----- vllm/utils/__init__.py | 34 ++++++++---- vllm/v1/worker/gpu_model_runner.py | 83 +++++++++++++++++------------- vllm/v1/worker/gpu_worker.py | 21 +++----- vllm/v1/worker/utils.py | 28 +++++++++- 8 files changed, 146 insertions(+), 106 deletions(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 7f04d6a49924..16ec3fbe66a7 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -177,14 +177,22 @@ async def test_fetch_video_http(video_url: str, num_frames: int): @pytest.mark.parametrize( "case", [ - # Base case + # Basic + dict( + mm_processor_device="cuda", + mm_processor_count=0, + available_device_count=1, + engine_device_count=1, + expected_gpu_allocation=[], + expected_mm_processors_per_gpu=0, + ), dict( mm_processor_device="cuda", mm_processor_count=1, available_device_count=1, engine_device_count=1, expected_gpu_allocation=["cuda:0"], - expected_mm_processors_per_engine_gpu=1, + expected_mm_processors_per_gpu=1, ), # Use Engine GPUs dict( @@ -193,7 +201,7 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=1, engine_device_count=1, expected_gpu_allocation=["cuda:0", 
"cuda:0"], - expected_mm_processors_per_engine_gpu=2, + expected_mm_processors_per_gpu=2, ), dict( mm_processor_device="cuda", @@ -201,7 +209,7 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=1, engine_device_count=2, expected_gpu_allocation=["cuda:0", "cuda:0"], - expected_mm_processors_per_engine_gpu=2, + expected_mm_processors_per_gpu=2, ), dict( mm_processor_device="cuda", @@ -209,7 +217,7 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=2, engine_device_count=2, expected_gpu_allocation=["cuda:0", "cuda:1"], - expected_mm_processors_per_engine_gpu=1, + expected_mm_processors_per_gpu=1, ), dict( mm_processor_device="cuda", @@ -217,7 +225,7 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=2, engine_device_count=2, expected_gpu_allocation=["cuda:0", "cuda:1", "cuda:0"], - expected_mm_processors_per_engine_gpu=2, + expected_mm_processors_per_gpu=2, ), # Use excess GPUs dict( @@ -226,7 +234,7 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=3, engine_device_count=2, expected_gpu_allocation=["cuda:2", "cuda:2"], - expected_mm_processors_per_engine_gpu=0, + expected_mm_processors_per_gpu=2, ), dict( mm_processor_device="cuda", @@ -234,7 +242,7 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=4, engine_device_count=2, expected_gpu_allocation=["cuda:2", "cuda:3"], - expected_mm_processors_per_engine_gpu=0, + expected_mm_processors_per_gpu=1, ), dict( mm_processor_device="cuda", @@ -242,7 +250,7 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=4, engine_device_count=2, expected_gpu_allocation=["cuda:2", "cuda:3", "cuda:2"], - expected_mm_processors_per_engine_gpu=0, + expected_mm_processors_per_gpu=2, ), ], ) @@ -253,12 +261,11 @@ def test_allocate_gpu_mm_processors(case): available_device_count = 
case["available_device_count"] engine_device_count = case["engine_device_count"] expected_gpu_allocation = case["expected_gpu_allocation"] - expected_mm_processors_per_engine_gpu = case[ - "expected_mm_processors_per_engine_gpu"] + expected_mm_processors_per_gpu = case["expected_mm_processors_per_gpu"] ( gpu_allocation, - mm_processors_per_engine_gpu, + mm_processors_per_gpu, ) = allocate_gpu_mm_processors( mm_processor_device, mm_processor_count, @@ -267,7 +274,7 @@ def test_allocate_gpu_mm_processors(case): ) assert gpu_allocation == expected_gpu_allocation - assert mm_processors_per_engine_gpu == expected_mm_processors_per_engine_gpu + assert mm_processors_per_gpu == expected_mm_processors_per_gpu # yapf: disable diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 974fee64bd9d..025b1aef30cb 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -422,14 +422,12 @@ class ModelConfig: `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. Set to `0` to disable this cache completely (not recommended).""" - mm_processors_per_engine_gpu: int = 0 + mm_processors_per_gpu: int = 0 """ - [Internal] The maximum number of multi-modal processors that use each GPU - in vLLM engine. + [Internal] The maximum number of multi-modal processors that use each GPU. This is needed to determine the peak memory of multi-modal processing - in the case of API server scale-out. 
- """ + in the case of API server scale-out.""" override_neuron_config: dict[str, Any] = field(default_factory=dict) """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to @@ -867,11 +865,11 @@ def set_mm_processor_kwargs(self, value: dict[str, Any]) -> None: mm_config.mm_processor_kwargs.update(value) - def set_mm_processors_per_engine_gpu(self, value: int) -> None: - self.mm_processors_per_engine_gpu = value + def set_mm_processors_per_gpu(self, value: int) -> None: + self.mm_processors_per_gpu = value if mm_config := self.multimodal_config: - mm_config.mm_processors_per_engine_gpu = value + mm_config.mm_processors_per_gpu = value def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( @@ -2510,7 +2508,7 @@ class MultiModalConfig: For example, to set num_frames for video, set `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ - mm_processor_kwargs: Optional[dict[str, object]] = None + mm_processor_kwargs: Optional[dict[str, Any]] = None """ Overrides for the multi-modal processor obtained from `transformers.AutoProcessor.from_pretrained`. @@ -2532,10 +2530,9 @@ class MultiModalConfig: Set to `0` to disable this cache completely (not recommended). """ - mm_processors_per_engine_gpu: int = -1 + mm_processors_per_gpu: int = 0 """ - [Internal] The maximum number of multi-modal processors that use each GPU - in vLLM engine. A value of `-1` means not set. + [Internal] The maximum number of multi-modal processors that use each GPU. This is needed to determine the peak memory of multi-modal processing in the case of API server scale-out. 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 56db60cd87c3..fe6417125097 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -352,8 +352,7 @@ class EngineArgs: MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb - mm_processors_per_engine_gpu: int = \ - MultiModalConfig.mm_processors_per_engine_gpu + mm_processors_per_gpu: int = MultiModalConfig.mm_processors_per_gpu # LoRA fields enable_lora: bool = False enable_lora_bias: bool = LoRAConfig.bias_enabled @@ -929,7 +928,7 @@ def create_model_config(self) -> ModelConfig: config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, - mm_processors_per_engine_gpu=self.mm_processors_per_engine_gpu, + mm_processors_per_gpu=self.mm_processors_per_gpu, override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, @@ -1262,7 +1261,7 @@ def create_engine_config( if mm_processor_device != "cpu": ( gpu_allocation, - mm_processors_per_engine_gpu, + mm_processors_per_gpu, ) = allocate_gpu_mm_processors( mm_processor_device, self.api_process_count, @@ -1279,8 +1278,8 @@ def create_engine_config( model_config.set_mm_processor_kwargs( {"device": new_device}) - model_config.set_mm_processors_per_engine_gpu( - mm_processors_per_engine_gpu) + model_config.set_mm_processors_per_gpu( + mm_processors_per_gpu) speculative_config = self.create_speculative_config( target_model_config=model_config, diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 27015112aa2b..3f598b765100 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -346,17 +346,13 @@ def allocate_gpu_mm_processors( processors, return the GPU allocation information. 
Returns: - A tuple `(mm_processor_gpus, mm_processors_per_engine_gpu)`, where: + A tuple `(mm_processor_gpus, mm_processors_per_gpu)`, where: - `gpu_allocation` is the device to allocate for each multi-modal processor. - - `mm_processors_per_engine_gpu` is the number of + - `mm_processors_per_gpu` is the number of multi-modal processors allocated to each GPU that is used by vLLM engine. """ - engine_device_count = min(engine_device_count, available_device_count) - - engine_gpu_idxs = list(range(engine_device_count)) - # In API server scale-out, allocate_gpu_mm_processors is called twice. # The first call happens in vllm.entrypoints.cli.serve and corresponds # to len(rest) == 0, resulting in each server targeting a specific device. @@ -365,8 +361,8 @@ def allocate_gpu_mm_processors( if len(rest) == 0: # Try to run each processor on a different GPU, preferably those # that are not used by vLLM engine - remaining_count = max(0, available_device_count - engine_device_count) - if remaining_count > 0: + if available_device_count > engine_device_count: + remaining_count = available_device_count - engine_device_count processor_gpu_idxs = [ engine_device_count + server_idx % remaining_count for server_idx in range(mm_processor_count) @@ -384,15 +380,12 @@ def allocate_gpu_mm_processors( gpu_allocation = [ f"{device_type}:{gpu_idx}" for gpu_idx in processor_gpu_idxs ] - - processor_engine_gpu_idxs = (gpu_idx for gpu_idx in processor_gpu_idxs - if gpu_idx in engine_gpu_idxs) - mm_processors_per_engine_gpu = max( - Counter(processor_engine_gpu_idxs).values(), + mm_processors_per_gpu = max( + Counter(processor_gpu_idxs).values(), default=0, ) - return gpu_allocation, mm_processors_per_engine_gpu + return gpu_allocation, mm_processors_per_gpu def argsort_mm_positions( diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 3d538c62b850..85571ad5398d 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2574,33 +2574,42 @@ class MemorySnapshot: 
torch_memory: int = 0 non_torch_memory: int = 0 timestamp: float = 0.0 + + device: torch.types.Device = None auto_measure: bool = True - def __post_init__(self): + def __post_init__(self) -> None: if self.auto_measure: self.measure() - def measure(self): + def measure(self) -> None: + device = self.device + # we measure the torch peak memory usage via allocated_bytes, # rather than `torch.cuda.memory_reserved()` . # After `torch.cuda.reset_peak_memory_stats()`, # `torch.cuda.memory_reserved()` will keep growing, and only shrink # when we call `torch.cuda.empty_cache()` or OOM happens. - self.torch_peak = torch.cuda.memory_stats().get( + self.torch_peak = torch.cuda.memory_stats(device).get( "allocated_bytes.all.peak", 0) - self.free_memory, self.total_memory = torch.cuda.mem_get_info() + self.free_memory, self.total_memory = torch.cuda.mem_get_info(device) self.cuda_memory = self.total_memory - self.free_memory # torch.cuda.memory_reserved() is how many bytes # PyTorch gets from cuda (by calling cudaMalloc, etc.) # this is used to measure the non-torch memory usage - self.torch_memory = torch.cuda.memory_reserved() + self.torch_memory = torch.cuda.memory_reserved(device) self.non_torch_memory = self.cuda_memory - self.torch_memory self.timestamp = time.time() def __sub__(self, other: MemorySnapshot) -> MemorySnapshot: + if self.device != other.device: + raise ValueError( + "The two snapshots should be from the same device! " + f"Found: {self.device} vs. 
{other.device}") + return MemorySnapshot( torch_peak=self.torch_peak - other.torch_peak, free_memory=self.free_memory - other.free_memory, @@ -2609,6 +2618,7 @@ def __sub__(self, other: MemorySnapshot) -> MemorySnapshot: torch_memory=self.torch_memory - other.torch_memory, non_torch_memory=self.non_torch_memory - other.non_torch_memory, timestamp=self.timestamp - other.timestamp, + device=self.device, auto_measure=False, ) @@ -2620,13 +2630,17 @@ class MemoryProfilingResult: non_kv_cache_memory: int = 0 torch_peak_increase: int = 0 non_torch_increase: int = 0 - weights_memory: float = 0 + weights_memory: int = 0 processing_memory: int = 0 before_create: MemorySnapshot = field(default_factory=MemorySnapshot) - before_profile: MemorySnapshot = field(default_factory=MemorySnapshot) - after_profile: MemorySnapshot = field(default_factory=MemorySnapshot) profile_time: float = 0.0 + def __post_init__(self) -> None: + device = self.before_create.device + + self.before_profile = MemorySnapshot(device=device, auto_measure=False) + self.after_profile = MemorySnapshot(device=device, auto_measure=False) + def __repr__(self) -> str: summary = f"Memory profiling takes {self.profile_time:.2f} seconds." @@ -2651,8 +2665,8 @@ def __repr__(self) -> str: @contextlib.contextmanager def memory_profiling( baseline_snapshot: MemorySnapshot, - weights_memory: int, *, + weights_memory: int = 0, processing_memory: int = 0, ) -> Generator[MemoryProfilingResult, None, None]: """Memory profiling context manager. 
@@ -2703,7 +2717,7 @@ def memory_profiling( """ # noqa gc.collect() torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() + torch.cuda.reset_peak_memory_stats(device=baseline_snapshot.device) result = MemoryProfilingResult( before_create=baseline_snapshot, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e0bb88b86932..d3d341df7d19 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -49,8 +49,8 @@ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, GiB_bytes, LazyLoader, MemorySnapshot, check_use_alibi, - get_dtype_size, is_pin_memory_available, round_up, - supports_dynamo) + get_dtype_size, is_pin_memory_available, + memory_profiling, round_up, supports_dynamo) from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, @@ -78,7 +78,8 @@ from ..sample.logits_processor import LogitsProcessorManager from .utils import (AttentionGroup, MultiModalBudget, bind_kv_cache, - gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, + check_enough_init_memory, gather_mm_placeholders, + initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) if TYPE_CHECKING: @@ -2031,43 +2032,53 @@ def maybe_profile_processing(self) -> None: model_config = self.model_config mm_config = model_config.multimodal_config - if mm_config and (mm_config.is_mm_processing_gpu and - (usage_mult := - mm_config.mm_processors_per_engine_gpu) > 0): - self.mm_registry.reset_processor_cache(model_config) - - mm_budget = self.mm_budget - assert mm_budget is not None + processor_memory_usage = 0 + + if mm_config and (mm_processor_kwargs := + mm_config.mm_processor_kwargs): + mm_processor_device = torch.device( + mm_processor_kwargs.get("device", "cpu")) + 
device_mult = mm_config.mm_processors_per_gpu + + if mm_processor_device != "cpu" and device_mult > 0: + mm_budget = self.mm_budget + assert mm_budget is not None + + self.mm_registry.reset_processor_cache(model_config) + + baseline_snapshot = MemorySnapshot(device=mm_processor_device) + if mm_processor_device != self.device: + check_enough_init_memory(baseline_snapshot, + self.cache_config) + + with memory_profiling(baseline_snapshot) as diff: + for modality, max_items_per_prompt in ( + mm_budget.max_items_per_prompt_by_modality.items() + ): + self.mm_registry.get_decoder_dummy_data( + model_config=model_config, + seq_len=self.max_num_tokens, + mm_counts={modality: max_items_per_prompt}, + ) - gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - - time_before_processing = time.perf_counter() - before_profile = MemorySnapshot(auto_measure=True) - - for modality, max_items_per_prompt in ( - mm_budget.max_items_per_prompt_by_modality.items()): - self.mm_registry.get_decoder_dummy_data( - model_config=model_config, - seq_len=self.max_num_tokens, - mm_counts={modality: max_items_per_prompt}, + processor_memory_usage = diff.torch_peak_increase * device_mult + logger.info( + "Input processing took %.4f GiB and %.6f seconds on %s", + processor_memory_usage / GiB_bytes, + diff.profile_time, + mm_processor_device, ) + if processor_memory_usage > diff.before_profile.free_memory: + raise ValueError( + f"No available memory in {mm_processor_device} " + f"for multi-modal processing. 
" + f"Try increasing `gpu_memory_utilization` " + f"or reduce `api_server_count`.") - gc.collect() - torch.cuda.empty_cache() - - time_after_processing = time.perf_counter() - after_profile = MemorySnapshot(auto_measure=True) + if mm_processor_device != self.device: + processor_memory_usage = 0 # Not on the engine GPU - diff_profile = after_profile - before_profile - self.processor_memory_usage = diff_profile.torch_peak * usage_mult - - logger.info("Input processing took %.4f GiB and %.6f seconds", - self.processor_memory_usage / GiB_bytes, - time_after_processing - time_before_processing) - else: - self.processor_memory_usage = 0 + self.processor_memory_usage = processor_memory_usage def reload_weights(self) -> None: assert getattr(self, "model", None) is not None, \ diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 9558365e51ed..5d8398174020 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -33,6 +33,8 @@ from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.worker_base import WorkerBase +from .utils import check_enough_init_memory + logger = init_logger(__name__) if TYPE_CHECKING: @@ -171,20 +173,11 @@ def init_device(self): torch.cuda.empty_cache() # take current memory snapshot - self.init_snapshot = MemorySnapshot() - self.requested_memory = (self.init_snapshot.total_memory * - self.cache_config.gpu_memory_utilization) - if self.init_snapshot.free_memory < self.requested_memory: - GiB = lambda b: round(b / GiB_bytes, 2) - raise ValueError( - f"Free memory on device " - f"({GiB(self.init_snapshot.free_memory)}/" - f"{GiB(self.init_snapshot.total_memory)} GiB) on startup " - f"is less than desired GPU memory utilization " - f"({self.cache_config.gpu_memory_utilization}, " - f"{GiB(self.requested_memory)} GiB). Decrease GPU memory " - f"utilization or reduce GPU memory used by other processes." 
- ) + self.init_snapshot = MemorySnapshot(device=self.device) + self.requested_memory = check_enough_init_memory( + self.init_snapshot, + self.cache_config, + ) else: raise RuntimeError( f"Not support device type: {self.device_config.device}") diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index e7079235d651..3252a5565a6c 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -7,10 +7,11 @@ import torch from vllm.attention.backends.abstract import AttentionBackend -from vllm.config import ModelConfig, SchedulerConfig +from vllm.config import CacheConfig, ModelConfig, SchedulerConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.registry import MultiModalRegistry +from vllm.utils import GiB_bytes, MemorySnapshot from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec @@ -202,6 +203,31 @@ def gather_mm_placeholders( return placeholders[is_embed] +def check_enough_init_memory( + init_snapshot: MemorySnapshot, + cache_config: CacheConfig, +) -> float: + """ + Calculate the amount of memory required by vLLM, then validate + that the current amount of free memory is sufficient for that. + """ + requested_memory = init_snapshot.total_memory * ( + cache_config.gpu_memory_utilization) + + if init_snapshot.free_memory < requested_memory: + GiB = lambda b: round(b / GiB_bytes, 2) + raise ValueError( + f"Free memory on device {init_snapshot.device} " + f"({GiB(init_snapshot.free_memory)}/" + f"{GiB(init_snapshot.total_memory)} GiB) on startup " + f"is less than desired GPU memory utilization " + f"({cache_config.gpu_memory_utilization}, " + f"{GiB(requested_memory)} GiB). 
Decrease GPU memory " + f"utilization or reduce GPU memory used by other processes.") + + return requested_memory + + def initialize_kv_cache_for_kv_sharing( shared_kv_cache_layers: dict[str, str], kv_cache_groups: list[KVCacheGroupSpec], From e50ef024f38b4ceba1bef960f1f529dd4968e631 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 14:19:44 +0000 Subject: [PATCH 055/130] Run profiling inside processor Signed-off-by: DarkLight1337 --- vllm/utils/__init__.py | 34 ++++----------- vllm/v1/engine/processor.py | 61 ++++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 69 ++---------------------------- vllm/v1/worker/gpu_worker.py | 1 - vllm/v1/worker/tpu_model_runner.py | 2 - vllm/v1/worker/utils.py | 8 ++-- 6 files changed, 78 insertions(+), 97 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 85571ad5398d..50df21872b81 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2631,7 +2631,6 @@ class MemoryProfilingResult: torch_peak_increase: int = 0 non_torch_increase: int = 0 weights_memory: int = 0 - processing_memory: int = 0 before_create: MemorySnapshot = field(default_factory=MemorySnapshot) profile_time: float = 0.0 @@ -2642,32 +2641,20 @@ def __post_init__(self) -> None: self.after_profile = MemorySnapshot(device=device, auto_measure=False) def __repr__(self) -> str: - summary = f"Memory profiling takes {self.profile_time:.2f} seconds." - - detail = [ - f"{title}: {value / GiB_bytes:.2f}GiB" for title, value in [ - "Total non KV cache memory", - self.non_kv_cache_memory, - "torch peak memory increase", - self.torch_peak_increase, - "non-torch forward increase memory", - self.non_torch_increase, - "model weights memory", - self.weights_memory, - "input processing memory", - self.processing_memory, - ] - ] - - return f"{summary} {'; '.join(detail)}." + return (f"Memory profiling takes {self.profile_time:.2f} seconds. 
" + f"Total non KV cache memory: " + f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; " + f"torch peak memory increase: " + f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; " + f"non-torch forward increase memory: " + f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; " + f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB.") @contextlib.contextmanager def memory_profiling( baseline_snapshot: MemorySnapshot, - *, weights_memory: int = 0, - processing_memory: int = 0, ) -> Generator[MemoryProfilingResult, None, None]: """Memory profiling context manager. baseline_snapshot: the memory snapshot before the current vLLM instance. @@ -2723,8 +2710,6 @@ def memory_profiling( before_create=baseline_snapshot, # the part of memory used for holding the model weights weights_memory=weights_memory, - # the part of memory used for input processing - processing_memory=processing_memory, ) result.before_profile.measure() @@ -2743,8 +2728,7 @@ def memory_profiling( result.profile_time = diff_profile.timestamp result.non_kv_cache_memory = (result.non_torch_increase + result.torch_peak_increase + - result.weights_memory + - result.processing_memory) + result.weights_memory) # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 376c76a7e728..0c4eb32496a4 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -5,10 +5,13 @@ from collections.abc import Mapping from typing import Any, Literal, Optional, Union +import torch + from vllm.config import VllmConfig from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs from vllm.inputs.parse import split_enc_dec_inputs from vllm.inputs.preprocess import InputPreprocessor +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.inputs import 
MultiModalKwargsItem, PlaceholderRange @@ -17,6 +20,7 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient from vllm.v1.structured_output.backend_guidance import ( @@ -25,6 +29,9 @@ validate_structured_output_request_outlines) from vllm.v1.structured_output.backend_xgrammar import ( validate_xgrammar_grammar) +from vllm.v1.worker.utils import MultiModalBudget, check_enough_init_memory + +logger = init_logger(__name__) class Processor: @@ -40,6 +47,7 @@ def __init__( self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config + self.scheduler_config = vllm_config.scheduler_config self.decoding_config = vllm_config.decoding_config self.tokenizer = tokenizer @@ -52,6 +60,8 @@ def __init__( self.mm_input_cache_client = MultiModalInputCacheClient( self.model_config, mm_registry) + self.profile_run() + @property def mm_registry(self): return self.input_preprocessor.mm_registry @@ -414,3 +424,54 @@ def _validate_model_input( # TODO: Find out how many placeholder tokens are there so we can # check that chunked prefill does not truncate them # max_batch_len = self.scheduler_config.max_num_batched_tokens + + def profile_run(self) -> None: + model_config = self.model_config + mm_config = model_config.multimodal_config + + processor_memory_usage = 0 + + if mm_config and (mm_processor_kwargs := + mm_config.mm_processor_kwargs): + mm_processor_device = torch.device( + mm_processor_kwargs.get("device", "cpu")) + device_mult = mm_config.mm_processors_per_gpu + + if mm_processor_device != "cpu" and device_mult > 0: + scheduler_config = self.scheduler_config + mm_budget = MultiModalBudget( + model_config, + 
scheduler_config, + self.mm_registry, + ) + + self.mm_registry.reset_processor_cache(model_config) + + baseline_snapshot = MemorySnapshot(device=mm_processor_device) + check_enough_init_memory(baseline_snapshot, self.cache_config) + + with memory_profiling(baseline_snapshot) as diff: + for modality, max_items_per_prompt in ( + mm_budget.max_items_per_prompt_by_modality.items() + ): + self.mm_registry.get_decoder_dummy_data( + model_config=model_config, + seq_len=scheduler_config.max_num_batched_tokens, + mm_counts={modality: max_items_per_prompt}, + ) + + processor_memory_usage = diff.torch_peak_increase * device_mult + logger.info( + "Input processing took %.4f GiB and %.6f seconds on %s", + processor_memory_usage / GiB_bytes, + diff.profile_time, + mm_processor_device, + ) + if processor_memory_usage > diff.before_profile.free_memory: + raise ValueError( + f"No available memory in {mm_processor_device} " + f"for multi-modal processing. " + f"Try increasing `gpu_memory_utilization` " + f"or reduce `api_server_count`.") + + self.processor_memory_usage = processor_memory_usage diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d3d341df7d19..0eb327df4124 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -48,9 +48,8 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, LazyLoader, MemorySnapshot, check_use_alibi, - get_dtype_size, is_pin_memory_available, - memory_profiling, round_up, supports_dynamo) + GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size, + is_pin_memory_available, round_up, supports_dynamo) from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, @@ -78,8 +77,7 @@ from 
..sample.logits_processor import LogitsProcessorManager from .utils import (AttentionGroup, MultiModalBudget, bind_kv_cache, - check_enough_init_memory, gather_mm_placeholders, - initialize_kv_cache_for_kv_sharing, + gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) if TYPE_CHECKING: @@ -332,10 +330,7 @@ def __init__( self.model_config, self.scheduler_config, self.mm_registry, - max_model_len=self.max_model_len, - max_num_reqs=self.max_num_reqs, - ) if self.supports_mm_inputs \ - else None) + ) if self.supports_mm_inputs else None) self.reorder_batch_threshold: Optional[int] = None @@ -2024,62 +2019,6 @@ def load_model(self, eep_scale_up: bool = False) -> None: fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, backend=backend) - # We don't call this from the worker because - # not all model runners support this - self.maybe_profile_processing() - - def maybe_profile_processing(self) -> None: - model_config = self.model_config - mm_config = model_config.multimodal_config - - processor_memory_usage = 0 - - if mm_config and (mm_processor_kwargs := - mm_config.mm_processor_kwargs): - mm_processor_device = torch.device( - mm_processor_kwargs.get("device", "cpu")) - device_mult = mm_config.mm_processors_per_gpu - - if mm_processor_device != "cpu" and device_mult > 0: - mm_budget = self.mm_budget - assert mm_budget is not None - - self.mm_registry.reset_processor_cache(model_config) - - baseline_snapshot = MemorySnapshot(device=mm_processor_device) - if mm_processor_device != self.device: - check_enough_init_memory(baseline_snapshot, - self.cache_config) - - with memory_profiling(baseline_snapshot) as diff: - for modality, max_items_per_prompt in ( - mm_budget.max_items_per_prompt_by_modality.items() - ): - self.mm_registry.get_decoder_dummy_data( - model_config=model_config, - seq_len=self.max_num_tokens, - mm_counts={modality: max_items_per_prompt}, - ) - - processor_memory_usage = 
diff.torch_peak_increase * device_mult - logger.info( - "Input processing took %.4f GiB and %.6f seconds on %s", - processor_memory_usage / GiB_bytes, - diff.profile_time, - mm_processor_device, - ) - if processor_memory_usage > diff.before_profile.free_memory: - raise ValueError( - f"No available memory in {mm_processor_device} " - f"for multi-modal processing. " - f"Try increasing `gpu_memory_utilization` " - f"or reduce `api_server_count`.") - - if mm_processor_device != self.device: - processor_memory_usage = 0 # Not on the engine GPU - - self.processor_memory_usage = processor_memory_usage - def reload_weights(self) -> None: assert getattr(self, "model", None) is not None, \ "Cannot reload weights before model is loaded." diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 5d8398174020..f198fa2e12f0 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -233,7 +233,6 @@ def determine_available_memory(self) -> int: with memory_profiling( self.init_snapshot, weights_memory=int(self.model_runner.model_memory_usage), - processing_memory=self.model_runner.processor_memory_usage, ) as profile_result: self.model_runner.profile_run() diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 46262284e333..7a209599916d 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -292,8 +292,6 @@ def __init__( self.model_config, self.scheduler_config, self.mm_registry, - max_model_len=self.max_model_len, - max_num_reqs=self.max_num_reqs, ) if self.supports_mm_inputs else None) if not self.use_spmd: diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 3252a5565a6c..aa82806da528 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -29,8 +29,8 @@ def __init__( scheduler_config: SchedulerConfig, mm_registry: MultiModalRegistry, *, - max_model_len: int, - max_num_reqs: int, + max_model_len: Optional[int] = None, + max_num_reqs: 
Optional[int] = None, ) -> None: super().__init__() @@ -46,8 +46,8 @@ def __init__( self.max_num_encoder_input_tokens = encoder_compute_budget self.encoder_cache_size = encoder_cache_size - self.max_model_len = max_model_len - self.max_num_reqs = max_num_reqs + self.max_model_len = max_model_len or model_config.max_model_len + self.max_num_reqs = max_num_reqs or scheduler_config.max_num_seqs self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config) From 72b5d94c3a031ee070c180fa0445301c924b02f3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 14:27:51 +0000 Subject: [PATCH 056/130] Deprecate Signed-off-by: DarkLight1337 --- vllm/v1/worker/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index aa82806da528..010498818075 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import warnings from collections import defaultdict from dataclasses import dataclass from typing import TYPE_CHECKING, Optional @@ -34,6 +35,13 @@ def __init__( ) -> None: super().__init__() + if max_model_len: + msg = "`max_model_len` is redundant and will be removed in v0.12." + warnings.warn(DeprecationWarning(msg), stacklevel=2) + if max_num_reqs: + msg = "`max_num_reqs` is redundant and will be removed in v0.12." 
+ warnings.warn(DeprecationWarning(msg), stacklevel=2) + self.model_config = model_config self.scheduler_config = scheduler_config self.mm_registry = mm_registry From e9b6f7b6c3de76ae24bec696e13cbe084ec13069 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:08:17 +0000 Subject: [PATCH 057/130] Avoid conflicting profile runs between API servers Signed-off-by: DarkLight1337 --- tests/multimodal/test_utils.py | 16 +---- vllm/config/__init__.py | 22 +----- vllm/engine/arg_utils.py | 30 --------- vllm/multimodal/utils.py | 22 ++---- vllm/v1/engine/processor.py | 119 +++++++++++++++++++++------------ 5 files changed, 84 insertions(+), 125 deletions(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 16ec3fbe66a7..8523ee9ba240 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -184,7 +184,6 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=1, engine_device_count=1, expected_gpu_allocation=[], - expected_mm_processors_per_gpu=0, ), dict( mm_processor_device="cuda", @@ -192,7 +191,6 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=1, engine_device_count=1, expected_gpu_allocation=["cuda:0"], - expected_mm_processors_per_gpu=1, ), # Use Engine GPUs dict( @@ -201,7 +199,6 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=1, engine_device_count=1, expected_gpu_allocation=["cuda:0", "cuda:0"], - expected_mm_processors_per_gpu=2, ), dict( mm_processor_device="cuda", @@ -209,7 +206,6 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=1, engine_device_count=2, expected_gpu_allocation=["cuda:0", "cuda:0"], - expected_mm_processors_per_gpu=2, ), dict( mm_processor_device="cuda", @@ -217,7 +213,6 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=2, engine_device_count=2, 
expected_gpu_allocation=["cuda:0", "cuda:1"], - expected_mm_processors_per_gpu=1, ), dict( mm_processor_device="cuda", @@ -225,7 +220,6 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=2, engine_device_count=2, expected_gpu_allocation=["cuda:0", "cuda:1", "cuda:0"], - expected_mm_processors_per_gpu=2, ), # Use excess GPUs dict( @@ -234,7 +228,6 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=3, engine_device_count=2, expected_gpu_allocation=["cuda:2", "cuda:2"], - expected_mm_processors_per_gpu=2, ), dict( mm_processor_device="cuda", @@ -242,7 +235,6 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=4, engine_device_count=2, expected_gpu_allocation=["cuda:2", "cuda:3"], - expected_mm_processors_per_gpu=1, ), dict( mm_processor_device="cuda", @@ -250,7 +242,6 @@ async def test_fetch_video_http(video_url: str, num_frames: int): available_device_count=4, engine_device_count=2, expected_gpu_allocation=["cuda:2", "cuda:3", "cuda:2"], - expected_mm_processors_per_gpu=2, ), ], ) @@ -261,12 +252,8 @@ def test_allocate_gpu_mm_processors(case): available_device_count = case["available_device_count"] engine_device_count = case["engine_device_count"] expected_gpu_allocation = case["expected_gpu_allocation"] - expected_mm_processors_per_gpu = case["expected_mm_processors_per_gpu"] - ( - gpu_allocation, - mm_processors_per_gpu, - ) = allocate_gpu_mm_processors( + gpu_allocation = allocate_gpu_mm_processors( mm_processor_device, mm_processor_count, available_device_count=available_device_count, @@ -274,7 +261,6 @@ def test_allocate_gpu_mm_processors(case): ) assert gpu_allocation == expected_gpu_allocation - assert mm_processors_per_gpu == expected_mm_processors_per_gpu # yapf: disable diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 025b1aef30cb..6c503c54bd41 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ 
-422,12 +422,6 @@ class ModelConfig: `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. Set to `0` to disable this cache completely (not recommended).""" - mm_processors_per_gpu: int = 0 - """ - [Internal] The maximum number of multi-modal processors that use each GPU. - - This is needed to determine the peak memory of multi-modal processing - in the case of API server scale-out.""" override_neuron_config: dict[str, Any] = field(default_factory=dict) """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to @@ -865,12 +859,6 @@ def set_mm_processor_kwargs(self, value: dict[str, Any]) -> None: mm_config.mm_processor_kwargs.update(value) - def set_mm_processors_per_gpu(self, value: int) -> None: - self.mm_processors_per_gpu = value - - if mm_config := self.multimodal_config: - mm_config.mm_processors_per_gpu = value - def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( self.model, self.revision) @@ -2508,7 +2496,7 @@ class MultiModalConfig: For example, to set num_frames for video, set `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ - mm_processor_kwargs: Optional[dict[str, Any]] = None + mm_processor_kwargs: Optional[dict[str, object]] = None """ Overrides for the multi-modal processor obtained from `transformers.AutoProcessor.from_pretrained`. @@ -2530,14 +2518,6 @@ class MultiModalConfig: Set to `0` to disable this cache completely (not recommended). """ - mm_processors_per_gpu: int = 0 - """ - [Internal] The maximum number of multi-modal processors that use each GPU. - - This is needed to determine the peak memory of multi-modal processing - in the case of API server scale-out. - """ - interleave_mm_strings: bool = False """ Enable fully interleaved support for multimodal prompts. 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fe6417125097..4902e8461b5c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -352,7 +352,6 @@ class EngineArgs: MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb - mm_processors_per_gpu: int = MultiModalConfig.mm_processors_per_gpu # LoRA fields enable_lora: bool = False enable_lora_bias: bool = LoRAConfig.bias_enabled @@ -928,7 +927,6 @@ def create_model_config(self) -> ModelConfig: config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, - mm_processors_per_gpu=self.mm_processors_per_gpu, override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, @@ -1253,34 +1251,6 @@ def create_engine_config( "between API and engine core processes.") model_config.set_mm_processor_cache_gb(0) - if mm_processor_kwargs := self.mm_processor_kwargs: - from vllm.multimodal.utils import allocate_gpu_mm_processors - - mm_processor_device: str = mm_processor_kwargs.get( - "device", "cpu") - if mm_processor_device != "cpu": - ( - gpu_allocation, - mm_processors_per_gpu, - ) = allocate_gpu_mm_processors( - mm_processor_device, - self.api_process_count, - available_device_count=current_platform.device_count( - ), # type: ignore - engine_device_count=parallel_config. 
- world_size_across_dp, - ) - - new_device = gpu_allocation[self.api_process_rank] - logger.info( - "Multi-modal processor will be run on device %s", - new_device) - - model_config.set_mm_processor_kwargs( - {"device": new_device}) - model_config.set_mm_processors_per_gpu( - mm_processors_per_gpu) - speculative_config = self.create_speculative_config( target_model_config=model_config, target_parallel_config=parallel_config, diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3f598b765100..62e7a86d161b 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -3,7 +3,6 @@ import asyncio import atexit -from collections import Counter from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor from itertools import groupby @@ -340,18 +339,13 @@ def allocate_gpu_mm_processors( *, available_device_count: int, engine_device_count: int, -) -> tuple[list[str], int]: +) -> list[str]: """ Given `--mm_processor_kwargs.device` and the number of multi-modal processors, return the GPU allocation information. - + Returns: - A tuple `(mm_processor_gpus, mm_processors_per_gpu)`, where: - - `gpu_allocation` is the device to allocate for each - multi-modal processor. - - `mm_processors_per_gpu` is the number of - multi-modal processors allocated to each GPU that is used - by vLLM engine. + The device to allocate for each multi-modal processor. """ # In API server scale-out, allocate_gpu_mm_processors is called twice. 
# The first call happens in vllm.entrypoints.cli.serve and corresponds @@ -377,15 +371,7 @@ def allocate_gpu_mm_processors( (device_idx, ) = map(int, rest) processor_gpu_idxs = [device_idx] * mm_processor_count - gpu_allocation = [ - f"{device_type}:{gpu_idx}" for gpu_idx in processor_gpu_idxs - ] - mm_processors_per_gpu = max( - Counter(processor_gpu_idxs).values(), - default=0, - ) - - return gpu_allocation, mm_processors_per_gpu + return [f"{device_type}:{gpu_idx}" for gpu_idx in processor_gpu_idxs] def argsort_mm_positions( diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 0c4eb32496a4..86de7449ff7f 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -16,7 +16,9 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.multimodal.processing import EncDecMultiModalProcessor -from vllm.multimodal.utils import argsort_mm_positions +from vllm.multimodal.utils import (allocate_gpu_mm_processors, + argsort_mm_positions) +from vllm.platforms import current_platform from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -47,6 +49,7 @@ def __init__( self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config + self.parallel_config = vllm_config.parallel_config self.scheduler_config = vllm_config.scheduler_config self.decoding_config = vllm_config.decoding_config self.tokenizer = tokenizer @@ -429,49 +432,83 @@ def profile_run(self) -> None: model_config = self.model_config mm_config = model_config.multimodal_config - processor_memory_usage = 0 + if not mm_config: + return - if mm_config and (mm_processor_kwargs := - mm_config.mm_processor_kwargs): - mm_processor_device = torch.device( - mm_processor_kwargs.get("device", "cpu")) - device_mult = 
mm_config.mm_processors_per_gpu + if mm_processor_kwargs := mm_config.mm_processor_kwargs: + orig_device = str(mm_processor_kwargs.get("device", "cpu")) + if orig_device == "cpu": + return - if mm_processor_device != "cpu" and device_mult > 0: - scheduler_config = self.scheduler_config - mm_budget = MultiModalBudget( - model_config, - scheduler_config, - self.mm_registry, - ) + # Peak memory usage (required for this profiling) + # is only tracked for CUDA + if not current_platform.is_cuda_alike(): + return + + parallel_config = self.parallel_config + device_count = current_platform.device_count() # type: ignore + + gpu_allocation = allocate_gpu_mm_processors( + orig_device, + parallel_config.api_process_count, + available_device_count=device_count, + engine_device_count=parallel_config.world_size_across_dp, + ) + + new_device = gpu_allocation[parallel_config.api_process_rank] + logger.info("Multi-modal processor will be run on device %s", + new_device) - self.mm_registry.reset_processor_cache(model_config) + new_device_ranks = [ + rank for rank in range(parallel_config.api_process_count) + if gpu_allocation[rank] == new_device + ] + + model_config.set_mm_processor_kwargs({"device": new_device}) + + # Only run profiling on the first Processor for each device, + # then multiply the usage by the number of processors for that + # device. + # Compared to running profiling on every Processor in parallel, + # this avoids non-deterministic peak memory usage calculation. 
+ if parallel_config.api_process_rank != new_device_ranks[0]: + return + + scheduler_config = self.scheduler_config + mm_budget = MultiModalBudget( + model_config, + scheduler_config, + self.mm_registry, + ) + + self.mm_registry.reset_processor_cache(model_config) - baseline_snapshot = MemorySnapshot(device=mm_processor_device) + baseline_snapshot = MemorySnapshot(device=new_device) + + # Only run this check if we are sure that the EngineCore is not + # running profiling on the same GPU + new_device_index = torch.device(new_device).index or 0 + if new_device_index < parallel_config.world_size_across_dp: check_enough_init_memory(baseline_snapshot, self.cache_config) - with memory_profiling(baseline_snapshot) as diff: - for modality, max_items_per_prompt in ( - mm_budget.max_items_per_prompt_by_modality.items() - ): - self.mm_registry.get_decoder_dummy_data( - model_config=model_config, - seq_len=scheduler_config.max_num_batched_tokens, - mm_counts={modality: max_items_per_prompt}, - ) - - processor_memory_usage = diff.torch_peak_increase * device_mult - logger.info( - "Input processing took %.4f GiB and %.6f seconds on %s", - processor_memory_usage / GiB_bytes, - diff.profile_time, - mm_processor_device, - ) - if processor_memory_usage > diff.before_profile.free_memory: - raise ValueError( - f"No available memory in {mm_processor_device} " - f"for multi-modal processing. 
" - f"Try increasing `gpu_memory_utilization` " - f"or reduce `api_server_count`.") - - self.processor_memory_usage = processor_memory_usage + with memory_profiling(baseline_snapshot) as diff: + for modality, max_items_per_prompt in ( + mm_budget.max_items_per_prompt_by_modality.items()): + self.mm_registry.get_decoder_dummy_data( + model_config=model_config, + seq_len=scheduler_config.max_num_batched_tokens, + mm_counts={modality: max_items_per_prompt}, + ) + + memory_usage = diff.torch_peak_increase * len(new_device_ranks) + logger.info( + "Input processing took %.4f GiB and %.6f seconds on %s", + memory_usage / GiB_bytes, + diff.profile_time, + new_device, + ) + if memory_usage > diff.before_profile.free_memory: + raise ValueError(f"No available memory in {new_device} " + f"for multi-modal processing. " + f"Try increasing `gpu_memory_utilization` " + f"or reduce `api_server_count`.") From e74ec9db24655b02a6b87471eaa07d24471a73a7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:12:13 +0000 Subject: [PATCH 058/130] Rename Signed-off-by: DarkLight1337 --- vllm/config/__init__.py | 2 +- vllm/v1/engine/processor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 6c503c54bd41..4727e6a9589e 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -847,7 +847,7 @@ def set_mm_processor_cache_gb(self, value: int) -> None: if mm_config := self.multimodal_config: mm_config.mm_processor_cache_gb = value - def set_mm_processor_kwargs(self, value: dict[str, Any]) -> None: + def update_mm_processor_kwargs(self, value: dict[str, Any]) -> None: if self.mm_processor_kwargs is None: self.mm_processor_kwargs = {} diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 86de7449ff7f..97fe696e5c01 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -464,7 +464,7 @@ def profile_run(self) -> None: if gpu_allocation[rank] == 
new_device ] - model_config.set_mm_processor_kwargs({"device": new_device}) + model_config.update_mm_processor_kwargs({"device": new_device}) # Only run profiling on the first Processor for each device, # then multiply the usage by the number of processors for that From fe15a965c91eead7e9d5a77faef0184705ea41ee Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:15:09 +0000 Subject: [PATCH 059/130] Reword Signed-off-by: DarkLight1337 --- vllm/v1/engine/processor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 97fe696e5c01..122a641a4c14 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -502,7 +502,7 @@ def profile_run(self) -> None: memory_usage = diff.torch_peak_increase * len(new_device_ranks) logger.info( - "Input processing took %.4f GiB and %.6f seconds on %s", + "Multi-modal processing took %.4f GiB and %.6f seconds on %s", memory_usage / GiB_bytes, diff.profile_time, new_device, @@ -510,5 +510,4 @@ def profile_run(self) -> None: if memory_usage > diff.before_profile.free_memory: raise ValueError(f"No available memory in {new_device} " f"for multi-modal processing. 
" - f"Try increasing `gpu_memory_utilization` " - f"or reduce `api_server_count`.") + f"Try reducing `api_server_count`.") From 47d5b8137268d2c3bd32481fec1790a60cb92f1b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:15:42 +0000 Subject: [PATCH 060/130] Fix Signed-off-by: DarkLight1337 --- vllm/v1/engine/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 122a641a4c14..14cc7a7213c3 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -488,7 +488,7 @@ def profile_run(self) -> None: # Only run this check if we are sure that the EngineCore is not # running profiling on the same GPU new_device_index = torch.device(new_device).index or 0 - if new_device_index < parallel_config.world_size_across_dp: + if new_device_index >= parallel_config.world_size_across_dp: check_enough_init_memory(baseline_snapshot, self.cache_config) with memory_profiling(baseline_snapshot) as diff: From b66826c2b2a9bc19c4900da83b5c0b360fdcb93d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:18:54 +0000 Subject: [PATCH 061/130] Warn Signed-off-by: DarkLight1337 --- vllm/v1/engine/processor.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 14cc7a7213c3..fb75389af52c 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -488,7 +488,14 @@ def profile_run(self) -> None: # Only run this check if we are sure that the EngineCore is not # running profiling on the same GPU new_device_index = torch.device(new_device).index or 0 - if new_device_index >= parallel_config.world_size_across_dp: + if new_device_index < parallel_config.world_size_across_dp: + logger.warning( + "Both EngineCore and multi-modal processing are using " + "the same GPU (%s). 
This may result in inaccurate memory " + "profiling, and resource contention during inference.", + new_device, + ) + else: check_enough_init_memory(baseline_snapshot, self.cache_config) with memory_profiling(baseline_snapshot) as diff: @@ -508,6 +515,6 @@ def profile_run(self) -> None: new_device, ) if memory_usage > diff.before_profile.free_memory: - raise ValueError(f"No available memory in {new_device} " + raise ValueError(f"Not enough memory in {new_device} " f"for multi-modal processing. " f"Try reducing `api_server_count`.") From 5e848343ad4150be74b49aeadeacc071c82b07a6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:20:36 +0000 Subject: [PATCH 062/130] Add TODO Signed-off-by: DarkLight1337 --- vllm/v1/engine/processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index fb75389af52c..10f08b06fc21 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -487,6 +487,7 @@ def profile_run(self) -> None: # Only run this check if we are sure that the EngineCore is not # running profiling on the same GPU + # TODO: world_size_across_dp is too conservative for multi-node new_device_index = torch.device(new_device).index or 0 if new_device_index < parallel_config.world_size_across_dp: logger.warning( From f01d3d1655fd5ebb67ba50d9cca712a8ab300baa Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:26:53 +0000 Subject: [PATCH 063/130] Comment Signed-off-by: DarkLight1337 --- vllm/v1/engine/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 10f08b06fc21..4c239c380e46 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -485,7 +485,7 @@ def profile_run(self) -> None: baseline_snapshot = MemorySnapshot(device=new_device) - # Only run this check if we are sure that the EngineCore is not + # Only check init memory if we are sure that 
the EngineCore is not # running profiling on the same GPU # TODO: world_size_across_dp is too conservative for multi-node new_device_index = torch.device(new_device).index or 0 From 5e8a9fc85791e270b6ee180acb1c12d1b42e7cc8 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:29:38 +0000 Subject: [PATCH 064/130] Comment Signed-off-by: DarkLight1337 --- vllm/v1/engine/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 4c239c380e46..21060251a512 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -486,7 +486,7 @@ def profile_run(self) -> None: baseline_snapshot = MemorySnapshot(device=new_device) # Only check init memory if we are sure that the EngineCore is not - # running profiling on the same GPU + # loading weights or running profiling on the same GPU # TODO: world_size_across_dp is too conservative for multi-node new_device_index = torch.device(new_device).index or 0 if new_device_index < parallel_config.world_size_across_dp: From f59b27dcdbe89cf7b8a373ba99372e905a3fa96d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:30:52 +0000 Subject: [PATCH 065/130] Remove redundant reset Signed-off-by: DarkLight1337 --- vllm/v1/engine/processor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 21060251a512..62fbb74a31e1 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -481,8 +481,6 @@ def profile_run(self) -> None: self.mm_registry, ) - self.mm_registry.reset_processor_cache(model_config) - baseline_snapshot = MemorySnapshot(device=new_device) # Only check init memory if we are sure that the EngineCore is not From 210f84973bdf262fa9b296237ea40a0876ed9d14 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:52:01 +0000 Subject: [PATCH 066/130] Fix not working on other platforms Signed-off-by: DarkLight1337 
--- vllm/v1/engine/processor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 62fbb74a31e1..0e53f0011f3f 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -440,11 +440,6 @@ def profile_run(self) -> None: if orig_device == "cpu": return - # Peak memory usage (required for this profiling) - # is only tracked for CUDA - if not current_platform.is_cuda_alike(): - return - parallel_config = self.parallel_config device_count = current_platform.device_count() # type: ignore @@ -466,6 +461,11 @@ def profile_run(self) -> None: model_config.update_mm_processor_kwargs({"device": new_device}) + # Peak memory usage (required for this profiling) + # is only tracked for CUDA + if not current_platform.is_cuda_alike(): + return + # Only run profiling on the first Processor for each device, # then multiply the usage by the number of processors for that # device. From 90eeeaa3649da893f8c7bc9bff545a029374c473 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:53:33 +0000 Subject: [PATCH 067/130] Comment Signed-off-by: DarkLight1337 --- vllm/v1/engine/processor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 0e53f0011f3f..116a390642d0 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -440,6 +440,8 @@ def profile_run(self) -> None: if orig_device == "cpu": return + # Allocate the GPU for each processor to avoid using the same + # GPUs as EngineCore parallel_config = self.parallel_config device_count = current_platform.device_count() # type: ignore From ea4e97f566e2068c0832185946213e969dc83352 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:54:53 +0000 Subject: [PATCH 068/130] Doc Signed-off-by: DarkLight1337 --- vllm/multimodal/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 62e7a86d161b..23c7078cab5d 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -341,8 +341,8 @@ def allocate_gpu_mm_processors( engine_device_count: int, ) -> list[str]: """ - Given `--mm_processor_kwargs.device` and the number of multi-modal - processors, return the GPU allocation information. + Allocate each processor to a GPU that is not being used by EngineCore, + if possible. Returns: The device to allocate for each multi-modal processor. From 999fe9f27b684c7e8f80f9fa0d66d2d51943a897 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:57:51 +0000 Subject: [PATCH 069/130] Update tests Signed-off-by: DarkLight1337 --- tests/multimodal/test_utils.py | 29 +++++++++++++++++++++++++++++ vllm/multimodal/utils.py | 4 ---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 8523ee9ba240..103fb240ef8f 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -243,6 +243,35 @@ async def test_fetch_video_http(video_url: str, num_frames: int): engine_device_count=2, expected_gpu_allocation=["cuda:2", "cuda:3", "cuda:2"], ), + # Specific device + dict( + mm_processor_device="cuda:0", + mm_processor_count=2, + available_device_count=4, + engine_device_count=2, + expected_gpu_allocation=["cuda:0", "cuda:0"], + ), + dict( + mm_processor_device="cuda:1", + mm_processor_count=2, + available_device_count=4, + engine_device_count=2, + expected_gpu_allocation=["cuda:1", "cuda:1"], + ), + dict( + mm_processor_device="cuda:2", + mm_processor_count=2, + available_device_count=4, + engine_device_count=2, + expected_gpu_allocation=["cuda:2", "cuda:2"], + ), + dict( + mm_processor_device="cuda:4", + mm_processor_count=2, + available_device_count=4, + engine_device_count=2, + expected_gpu_allocation=["cuda:4", "cuda:4"], + ), ], ) # yapf: enable diff --git a/vllm/multimodal/utils.py 
b/vllm/multimodal/utils.py index 23c7078cab5d..d9e5faec4015 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -347,10 +347,6 @@ def allocate_gpu_mm_processors( Returns: The device to allocate for each multi-modal processor. """ - # In API server scale-out, allocate_gpu_mm_processors is called twice. - # The first call happens in vllm.entrypoints.cli.serve and corresponds - # to len(rest) == 0, resulting in each server targeting a specific device. - # The second call happens in arg_utils.py and corresponds to len(rest) = 1 device_type, *rest = mm_processor_device.rsplit(":", 1) if len(rest) == 0: # Try to run each processor on a different GPU, preferably those From 1f3fb95099eebc41327817577aa526d03949d938 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 15:59:41 +0000 Subject: [PATCH 070/130] Update Signed-off-by: DarkLight1337 --- tests/multimodal/test_utils.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 103fb240ef8f..3a54c4540ce5 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -251,13 +251,6 @@ async def test_fetch_video_http(video_url: str, num_frames: int): engine_device_count=2, expected_gpu_allocation=["cuda:0", "cuda:0"], ), - dict( - mm_processor_device="cuda:1", - mm_processor_count=2, - available_device_count=4, - engine_device_count=2, - expected_gpu_allocation=["cuda:1", "cuda:1"], - ), dict( mm_processor_device="cuda:2", mm_processor_count=2, @@ -265,6 +258,7 @@ async def test_fetch_video_http(video_url: str, num_frames: int): engine_device_count=2, expected_gpu_allocation=["cuda:2", "cuda:2"], ), + # Out-of-bounds device dict( mm_processor_device="cuda:4", mm_processor_count=2, From 28bde79f87844c3f058e0b360048b9f7a3ca6df3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 16:04:46 +0000 Subject: [PATCH 071/130] Simplify code Signed-off-by: DarkLight1337 --- 
vllm/config/__init__.py | 5 ++--- vllm/inputs/registry.py | 2 +- vllm/v1/engine/processor.py | 11 +++-------- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 4727e6a9589e..bcf877881d61 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2524,10 +2524,9 @@ class MultiModalConfig: """ @property - def is_mm_processing_gpu(self) -> bool: + def mm_processing_device(self) -> str: kwargs = self.mm_processor_kwargs or {} - - return kwargs.get("device", "cpu") != "cpu" + return str(kwargs.get("device", "cpu")) def compute_hash(self) -> str: """ diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 35f07497bd06..df311096f81d 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -146,7 +146,7 @@ def _postprocess_output( output: JSONTree, ) -> JSONTree: mm_config = self.model_config.get_multimodal_config() - is_mm_processing_gpu = mm_config.is_mm_processing_gpu + is_mm_processing_gpu = mm_config.mm_processing_device != "cpu" def _postprocess_one(x: object): if isinstance(x, torch.Tensor): diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 116a390642d0..3e548b502671 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -435,18 +435,13 @@ def profile_run(self) -> None: if not mm_config: return - if mm_processor_kwargs := mm_config.mm_processor_kwargs: - orig_device = str(mm_processor_kwargs.get("device", "cpu")) - if orig_device == "cpu": - return - - # Allocate the GPU for each processor to avoid using the same - # GPUs as EngineCore + if mm_config.mm_processing_device != "cpu": + # Try to avoid using the same GPU as EngineCore parallel_config = self.parallel_config device_count = current_platform.device_count() # type: ignore gpu_allocation = allocate_gpu_mm_processors( - orig_device, + mm_config.mm_processing_device, parallel_config.api_process_count, available_device_count=device_count, 
engine_device_count=parallel_config.world_size_across_dp, From ba6c0d6c3a077c99deced0414ced0e0cde2f81bf Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Aug 2025 16:11:49 +0000 Subject: [PATCH 072/130] Clean up Signed-off-by: DarkLight1337 --- vllm/v1/engine/processor.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 3e548b502671..d5083c1c8461 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -447,15 +447,11 @@ def profile_run(self) -> None: engine_device_count=parallel_config.world_size_across_dp, ) - new_device = gpu_allocation[parallel_config.api_process_rank] + api_process_rank = parallel_config.api_process_rank + new_device = gpu_allocation[api_process_rank] + logger.info("Multi-modal processor will be run on device %s", new_device) - - new_device_ranks = [ - rank for rank in range(parallel_config.api_process_count) - if gpu_allocation[rank] == new_device - ] - model_config.update_mm_processor_kwargs({"device": new_device}) # Peak memory usage (required for this profiling) @@ -468,7 +464,7 @@ def profile_run(self) -> None: # device. # Compared to running profiling on every Processor in parallel, # this avoids non-deterministic peak memory usage calculation. - if parallel_config.api_process_rank != new_device_ranks[0]: + if api_process_rank != gpu_allocation.index(new_device): return scheduler_config = self.scheduler_config @@ -486,7 +482,7 @@ def profile_run(self) -> None: new_device_index = torch.device(new_device).index or 0 if new_device_index < parallel_config.world_size_across_dp: logger.warning( - "Both EngineCore and multi-modal processing are using " + "Both EngineCore and multi-modal processor are using " "the same GPU (%s). 
This may result in inaccurate memory " "profiling, and resource contention during inference.", new_device, @@ -503,7 +499,8 @@ def profile_run(self) -> None: mm_counts={modality: max_items_per_prompt}, ) - memory_usage = diff.torch_peak_increase * len(new_device_ranks) + usage_mult = gpu_allocation.count(new_device) + memory_usage = diff.torch_peak_increase * usage_mult logger.info( "Multi-modal processing took %.4f GiB and %.6f seconds on %s", memory_usage / GiB_bytes, @@ -512,5 +509,5 @@ def profile_run(self) -> None: ) if memory_usage > diff.before_profile.free_memory: raise ValueError(f"Not enough memory in {new_device} " - f"for multi-modal processing. " + f"for multi-modal processor. " f"Try reducing `api_server_count`.") From 949cf54d9db7e4cd37739a24a62ef3b0b4163771 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 15 Aug 2025 08:40:11 +0000 Subject: [PATCH 073/130] Fix tests Signed-off-by: DarkLight1337 --- examples/others/tensorize_vllm_model.py | 9 +-------- tests/worker/test_profile.py | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 559c7c493aca..2b7f0beab227 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import dataclasses import json import logging import os @@ -327,12 +325,7 @@ def main(): if args.command == "serialize": - eng_args_dict = {f.name: getattr(args, f.name) for f in - dataclasses.fields(EngineArgs)} - - engine_args = EngineArgs.from_cli_args( - argparse.Namespace(**eng_args_dict) - ) + engine_args = EngineArgs.from_cli_args(args) input_dir = tensorizer_dir.rstrip('/') suffix = args.suffix if args.suffix else uuid.uuid4().hex diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index d8767f700b57..a415620eb64a 100644 
--- a/tests/worker/test_profile.py +++ b/tests/worker/test_profile.py @@ -35,7 +35,7 @@ def test_gpu_memory_profiling(): ) # Set 10GiB as the total gpu ram to be device-agnostic - def mock_mem_info(): + def mock_mem_info(device: torch.types.Device = None): current_usage = torch.cuda.memory_stats( )["allocated_bytes.all.current"] mock_total_bytes = 10 * 1024**3 From bdbe1f4355a7d70fb3e8a404b0061779ca4c807b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 06:37:10 +0000 Subject: [PATCH 074/130] Update `supports_ipc_cache` Signed-off-by: DarkLight1337 --- vllm/multimodal/cache.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 0e81cb6d4d19..95765bfbe0f2 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -380,8 +380,9 @@ def _enable_processor_cache( def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool: parallel_config = vllm_config.parallel_config - supports_ipc_cache = (parallel_config.data_parallel_size == 1 - or parallel_config.data_parallel_external_lb) + supports_ipc_cache = (parallel_config.api_process_count == 1 + and (parallel_config.data_parallel_size == 1 + or parallel_config.data_parallel_external_lb)) return supports_ipc_cache From d52aa966820cc88f14a9547bcc4fba80f63fc07a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 06:51:19 +0000 Subject: [PATCH 075/130] [Frontend] Pass API server count to each process Signed-off-by: DarkLight1337 --- examples/others/tensorize_vllm_model.py | 9 +----- .../test_api_server_process_manager.py | 6 ++-- vllm/config/parallel.py | 6 ++++ vllm/engine/arg_utils.py | 9 +++++- vllm/entrypoints/cli/serve.py | 18 ++++++----- vllm/multimodal/cache.py | 5 ++-- vllm/v1/engine/core_client.py | 1 + vllm/v1/utils.py | 30 +++++++++++-------- 8 files changed, 50 insertions(+), 34 deletions(-) diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 
559c7c493aca..2b7f0beab227 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import dataclasses import json import logging import os @@ -327,12 +325,7 @@ def main(): if args.command == "serialize": - eng_args_dict = {f.name: getattr(args, f.name) for f in - dataclasses.fields(EngineArgs)} - - engine_args = EngineArgs.from_cli_args( - argparse.Namespace(**eng_args_dict) - ) + engine_args = EngineArgs.from_cli_args(args) input_dir = tensorizer_dir.rstrip('/') suffix = args.suffix if args.suffix else uuid.uuid4().hex diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py index e4af60a78265..882382a38543 100644 --- a/tests/entrypoints/test_api_server_process_manager.py +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -36,10 +36,10 @@ def api_server_args(): "localhost:8000", "sock": sock, - "args": - "test_args", # Simple string to avoid pickling issues "num_servers": 3, + "args_per_server": + ["test_args"] * 3, # Simple string to avoid pickling issues "input_addresses": [ "tcp://127.0.0.1:5001", "tcp://127.0.0.1:5002", "tcp://127.0.0.1:5003" @@ -60,7 +60,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update): global WORKER_RUNTIME_SECONDS WORKER_RUNTIME_SECONDS = 0.5 - # Copy the args to avoid mutating the + # Copy the args to avoid mutating them args = api_server_args.copy() if not with_stats_update: diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 9ea883d4a03c..b7f7231ef6da 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -96,6 +96,12 @@ class ParallelConfig: between local data parallel ranks, but an external LB balances between vLLM nodes/replicas. 
Set explicitly in conjunction with --data-parallel-start-rank.""" + + api_process_count: int = 1 + """[Internal] The number of API processes initialized.""" + api_process_rank: int = 0 + """[Internal] The rank of this API process.""" + enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" enable_eplb: bool = False diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9e7c95ea5205..91c98f6e64b2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -303,6 +303,8 @@ class EngineArgs: data_parallel_rpc_port: Optional[int] = None data_parallel_hybrid_lb: bool = False data_parallel_backend: str = ParallelConfig.data_parallel_backend + api_process_count: int = ParallelConfig.api_process_count + api_process_rank: int = ParallelConfig.api_process_rank enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config") enable_eplb: bool = ParallelConfig.enable_eplb @@ -895,7 +897,10 @@ def from_cli_args(cls, args: argparse.Namespace): # Get the list of attributes of this dataclass. attrs = [attr.name for attr in dataclasses.fields(cls)] # Set the attributes from the parsed arguments. 
- engine_args = cls(**{attr: getattr(args, attr) for attr in attrs}) + engine_args = cls(**{ + attr: getattr(args, attr) + for attr in attrs if hasattr(args, attr) + }) return engine_args def create_model_config(self) -> ModelConfig: @@ -1280,6 +1285,8 @@ def create_engine_config( data_parallel_rpc_port=data_parallel_rpc_port, data_parallel_backend=self.data_parallel_backend, data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, + api_process_count=self.api_process_count, + api_process_rank=self.api_process_rank, enable_expert_parallel=self.enable_expert_parallel, enable_eplb=self.enable_eplb, eplb_config=self.eplb_config, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 803a3e004656..2ebffe9cb690 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -3,6 +3,7 @@ import argparse import signal +from copy import deepcopy from typing import Optional import uvloop @@ -135,10 +136,11 @@ def signal_handler(signum, frame): def run_multi_api_server(args: argparse.Namespace): assert not args.headless - num_api_servers = args.api_server_count + num_api_servers: int = args.api_server_count assert num_api_servers > 0 - orig_mm_processor_cache_gb = args.mm_processor_cache_gb + # No need to set api_process_rank for EngineCore processes + args.api_process_count = args.api_server_count if num_api_servers > 1: setup_multiprocess_prometheus() @@ -151,7 +153,6 @@ def run_multi_api_server(args: argparse.Namespace): engine_args = vllm.AsyncEngineArgs.from_cli_args(args) usage_context = UsageContext.OPENAI_API_SERVER vllm_config = engine_args.create_engine_config(usage_context=usage_context) - model_config = vllm_config.model_config if num_api_servers > 1: if not envs.VLLM_USE_V1: @@ -161,10 +162,6 @@ def run_multi_api_server(args: argparse.Namespace): raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used " "with api_server_count > 1") - if model_config.is_multimodal_model and orig_mm_processor_cache_gb > 0: - 
logger.warning("Multi-modal processor cache is disabled because " - "it is not compatible with `api_server_count > 1`.") - executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats @@ -174,6 +171,11 @@ def run_multi_api_server(args: argparse.Namespace): hybrid_dp_lb = parallel_config.data_parallel_hybrid_lb assert external_dp_lb or hybrid_dp_lb or dp_rank == 0 + # Set api_process_rank for API server processes + args_per_server = [deepcopy(args) for _ in range(num_api_servers)] + for server_idx in range(num_api_servers): + args_per_server[server_idx].api_process_rank = server_idx + api_server_manager: Optional[APIServerProcessManager] = None with launch_core_engines(vllm_config, executor_class, log_stats, @@ -185,7 +187,7 @@ def run_multi_api_server(args: argparse.Namespace): target_server_fn=run_api_server_worker_proc, listen_address=listen_address, sock=sock, - args=args, + args_per_server=args_per_server, num_servers=num_api_servers, input_addresses=addresses.inputs, output_addresses=addresses.outputs, diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 0e81cb6d4d19..95765bfbe0f2 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -380,8 +380,9 @@ def _enable_processor_cache( def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool: parallel_config = vllm_config.parallel_config - supports_ipc_cache = (parallel_config.data_parallel_size == 1 - or parallel_config.data_parallel_external_lb) + supports_ipc_cache = (parallel_config.api_process_count == 1 + and (parallel_config.data_parallel_size == 1 + or parallel_config.data_parallel_external_lb)) return supports_ipc_cache diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 079dd9a7d38d..54231cebea20 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -772,6 +772,7 @@ def __init__(self, client_addresses=client_addresses, ) + self.client_count = client_count self.client_index = 
client_index self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs, Exception]]() diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index b5750c82db02..fb8b1ba4ac2a 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -123,8 +123,8 @@ def __init__( target_server_fn: Callable, listen_address: str, sock: Any, - args: argparse.Namespace, num_servers: int, + args_per_server: list[argparse.Namespace], input_addresses: list[str], output_addresses: list[str], stats_update_address: Optional[str] = None, @@ -135,35 +135,41 @@ def __init__( target_server_fn: Function to call for each API server process listen_address: Address to listen for client connections sock: Socket for client connections - args: Command line arguments num_servers: Number of API server processes to start + args_per_server: Command line arguments for each API server input_addresses: Input addresses for each API server output_addresses: Output addresses for each API server stats_update_address: Optional stats update address """ + if len(args_per_server) != num_servers: + raise ValueError(f"Incorrect {len(args_per_server)=}") + if len(input_addresses) != num_servers: + raise ValueError(f"Incorrect {len(input_addresses)=}") + if len(output_addresses) != num_servers: + raise ValueError(f"Incorrect {len(output_addresses)=}") + self.listen_address = listen_address self.sock = sock - self.args = args # Start API servers spawn_context = multiprocessing.get_context("spawn") self.processes: list[BaseProcess] = [] - for i, in_addr, out_addr in zip(range(num_servers), input_addresses, - output_addresses): + for i in range(num_servers): client_config = { - "input_address": in_addr, - "output_address": out_addr, + "input_address": input_addresses[i], + "output_address": output_addresses[i], "client_count": num_servers, - "client_index": i + "client_index": i, } if stats_update_address is not None: client_config["stats_update_address"] = stats_update_address - proc = 
spawn_context.Process(target=target_server_fn, - name=f"ApiServer_{i}", - args=(listen_address, sock, args, - client_config)) + proc = spawn_context.Process( + target=target_server_fn, + name=f"ApiServer_{i}", + args=(listen_address, sock, args_per_server[i], client_config), + ) self.processes.append(proc) proc.start() From 5ff210dc887b454c4cb0e90758d35a50f0259116 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 07:18:13 +0000 Subject: [PATCH 076/130] Tests Signed-off-by: DarkLight1337 --- tests/v1/test_external_lb_dp.py | 34 ++++++++++++++++++++++++---- tests/v1/test_hybrid_lb_dp.py | 36 +++++++++++++++++++++++++----- tests/v1/test_internal_lb_dp.py | 39 +++++++++++++++++++++++++++------ 3 files changed, 93 insertions(+), 16 deletions(-) diff --git a/tests/v1/test_external_lb_dp.py b/tests/v1/test_external_lb_dp.py index 4a5c47fead58..6e7845f407ae 100644 --- a/tests/v1/test_external_lb_dp.py +++ b/tests/v1/test_external_lb_dp.py @@ -9,6 +9,7 @@ import openai # use the official client for correctness check import pytest import pytest_asyncio +import requests from tests.utils import RemoteOpenAIServer from vllm.platforms import current_platform @@ -70,6 +71,8 @@ def start_server(r: int, sargs: list[str]): sargs, auto_port=False, env_dict={ + "VLLM_SERVER_DEV_MODE": + "1", current_platform.device_control_env_var: ",".join( str( @@ -127,11 +130,18 @@ def default_server_args(): @pytest.fixture(scope="module", params=[1, 4]) -def servers(request, default_server_args): +def server_manager(request, default_server_args): api_server_count = request.param - with ExternalLBServerManager(MODEL_NAME, DP_SIZE, api_server_count, - default_server_args) as server_list: - yield server_list + server_manager = ExternalLBServerManager(MODEL_NAME, DP_SIZE, + api_server_count, + default_server_args) + + with server_manager: + yield server_manager + + +def servers(server_manager): + return server_manager.servers @pytest_asyncio.fixture @@ -144,6 +154,22 @@ async def 
clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]): ] +def test_external_lb_server_info(server_manager): + servers = server_manager.servers + api_server_count = server_manager.api_server_count + + for i, (server, _) in enumerate(servers): + response = requests.get(server.url_for("/server_info")) + response.raise_for_status() + + vllm_config = response.json() + parallel_config = vllm_config["parallel_config"] + + assert parallel_config[ + "api_process_count"] == api_server_count, f"Failed ({i=})" + assert parallel_config["api_process_rank"] == 0, f"Failed ({i=})" + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", diff --git a/tests/v1/test_hybrid_lb_dp.py b/tests/v1/test_hybrid_lb_dp.py index 293b1257be6b..1fa8597d4713 100644 --- a/tests/v1/test_hybrid_lb_dp.py +++ b/tests/v1/test_hybrid_lb_dp.py @@ -9,6 +9,7 @@ import openai # use the official client for correctness check import pytest import pytest_asyncio +import requests from tests.utils import RemoteOpenAIServer from tests.v1.test_utils import check_request_balancing @@ -92,6 +93,8 @@ def start_server(node: int, sargs: list[str]): sargs, auto_port=False, env_dict={ + "VLLM_SERVER_DEV_MODE": + "1", current_platform.device_control_env_var: ",".join( str( @@ -150,12 +153,19 @@ def default_server_args(): @pytest.fixture(scope="module", params=[1, 4]) -def servers(request, default_server_args): +def server_manager(request, default_server_args): api_server_count = request.param - with HybridLBServerManager(MODEL_NAME, DP_SIZE, api_server_count, - default_server_args, DP_SIZE_LOCAL, - TP_SIZE) as server_list: - yield server_list + server_manager = HybridLBServerManager(MODEL_NAME, DP_SIZE, + api_server_count, + default_server_args, DP_SIZE_LOCAL, + TP_SIZE) + + with server_manager: + yield server_manager + + +def servers(server_manager): + return server_manager.servers @pytest_asyncio.fixture @@ -168,6 +178,22 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]): ] +def 
test_hybrid_dp_server_info(server_manager): + servers = server_manager.servers + api_server_count = server_manager.api_server_count + + for i, (server, _) in enumerate(servers): + response = requests.get(server.url_for("/server_info")) + response.raise_for_status() + + vllm_config = response.json() + parallel_config = vllm_config["parallel_config"] + + assert parallel_config[ + "api_process_count"] == api_server_count, f"Failed ({i=})" + assert parallel_config["api_process_rank"] == i, f"Failed ({i=})" + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", diff --git a/tests/v1/test_internal_lb_dp.py b/tests/v1/test_internal_lb_dp.py index 2b031865cad7..373a8ab22eab 100644 --- a/tests/v1/test_internal_lb_dp.py +++ b/tests/v1/test_internal_lb_dp.py @@ -10,6 +10,7 @@ import openai # use the official client for correctness check import pytest import pytest_asyncio +import requests from tests.utils import RemoteOpenAIServer from tests.v1.test_utils import check_request_balancing @@ -230,6 +231,8 @@ def start_engines_server(): engines_server_args, auto_port=False, env_dict={ + "VLLM_SERVER_DEV_MODE": + "1", current_platform.device_control_env_var: ",".join( str( @@ -293,14 +296,20 @@ def default_server_args(): @pytest.fixture(scope="module", params=[1, 4]) -def servers(request, default_server_args): +def server_manager(request, default_server_args): api_server_count = request.param - with MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE, - api_server_count, - default_server_args, - DP_SIZE // NUM_NODES, - TP_SIZE) as server_list: - yield server_list + server_manager = MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE, + api_server_count, + default_server_args, + DP_SIZE // NUM_NODES, + TP_SIZE) + + with server_manager: + yield server_manager + + +def servers(server_manager): + return server_manager.servers @pytest.fixture(scope="module", params=[1, 4]) @@ -331,6 +340,22 @@ async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer, yield 
client +def test_multinode_dp_server_info(server_manager): + servers = server_manager.servers + api_server_count = server_manager.api_server_count + + for i, (server, _) in enumerate(servers): + response = requests.get(server.url_for("/server_info")) + response.raise_for_status() + + vllm_config = response.json() + parallel_config = vllm_config["parallel_config"] + + assert parallel_config[ + "api_process_count"] == api_server_count, f"Failed ({i=})" + assert parallel_config["api_process_rank"] == i, f"Failed ({i=})" + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", From ed761704663772878b3bfa5ed8ed306980992ccc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 07:24:11 +0000 Subject: [PATCH 077/130] Update Signed-off-by: DarkLight1337 --- tests/v1/test_external_lb_dp.py | 5 ++++- tests/v1/test_hybrid_lb_dp.py | 5 ++++- tests/v1/test_internal_lb_dp.py | 5 ++++- vllm/multimodal/cache.py | 6 +++--- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/v1/test_external_lb_dp.py b/tests/v1/test_external_lb_dp.py index 6e7845f407ae..7a774ef8a22b 100644 --- a/tests/v1/test_external_lb_dp.py +++ b/tests/v1/test_external_lb_dp.py @@ -159,7 +159,7 @@ def test_external_lb_server_info(server_manager): api_server_count = server_manager.api_server_count for i, (server, _) in enumerate(servers): - response = requests.get(server.url_for("/server_info")) + response = requests.get(server.url_for("server_info")) response.raise_for_status() vllm_config = response.json() @@ -169,6 +169,9 @@ def test_external_lb_server_info(server_manager): "api_process_count"] == api_server_count, f"Failed ({i=})" assert parallel_config["api_process_rank"] == 0, f"Failed ({i=})" + # Logging in case a non-assert exception occurs + print(f"Passed ({i=})") + @pytest.mark.asyncio @pytest.mark.parametrize( diff --git a/tests/v1/test_hybrid_lb_dp.py b/tests/v1/test_hybrid_lb_dp.py index 1fa8597d4713..fa9f645ad2af 100644 --- a/tests/v1/test_hybrid_lb_dp.py +++ 
b/tests/v1/test_hybrid_lb_dp.py @@ -183,7 +183,7 @@ def test_hybrid_dp_server_info(server_manager): api_server_count = server_manager.api_server_count for i, (server, _) in enumerate(servers): - response = requests.get(server.url_for("/server_info")) + response = requests.get(server.url_for("server_info")) response.raise_for_status() vllm_config = response.json() @@ -193,6 +193,9 @@ def test_hybrid_dp_server_info(server_manager): "api_process_count"] == api_server_count, f"Failed ({i=})" assert parallel_config["api_process_rank"] == i, f"Failed ({i=})" + # Logging in case a non-assert exception occurs + print(f"Passed ({i=})") + @pytest.mark.asyncio @pytest.mark.parametrize( diff --git a/tests/v1/test_internal_lb_dp.py b/tests/v1/test_internal_lb_dp.py index 373a8ab22eab..939e8e060a8e 100644 --- a/tests/v1/test_internal_lb_dp.py +++ b/tests/v1/test_internal_lb_dp.py @@ -345,7 +345,7 @@ def test_multinode_dp_server_info(server_manager): api_server_count = server_manager.api_server_count for i, (server, _) in enumerate(servers): - response = requests.get(server.url_for("/server_info")) + response = requests.get(server.url_for("server_info")) response.raise_for_status() vllm_config = response.json() @@ -355,6 +355,9 @@ def test_multinode_dp_server_info(server_manager): "api_process_count"] == api_server_count, f"Failed ({i=})" assert parallel_config["api_process_rank"] == i, f"Failed ({i=})" + # Logging in case a non-assert exception occurs + print(f"Passed ({i=})") + @pytest.mark.asyncio @pytest.mark.parametrize( diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 95765bfbe0f2..6a3a63199a67 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -380,9 +380,9 @@ def _enable_processor_cache( def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool: parallel_config = vllm_config.parallel_config - supports_ipc_cache = (parallel_config.api_process_count == 1 - and (parallel_config.data_parallel_size == 1 - or 
parallel_config.data_parallel_external_lb)) + supports_ipc_cache = ((parallel_config.api_process_count == 1 + and parallel_config.data_parallel_size == 1) + or parallel_config.data_parallel_external_lb) return supports_ipc_cache From 90703bd9d62ff4b9419c42af365be350660edbab Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 08:11:22 +0000 Subject: [PATCH 078/130] Update and fix tests Signed-off-by: DarkLight1337 --- tests/v1/test_external_lb_dp.py | 27 ++++++++++++------- tests/v1/test_hybrid_lb_dp.py | 27 ++++++++++++------- tests/v1/test_internal_lb_dp.py | 39 ++++++++++++++++----------- vllm/entrypoints/openai/api_server.py | 21 ++++++++++++--- 4 files changed, 74 insertions(+), 40 deletions(-) diff --git a/tests/v1/test_external_lb_dp.py b/tests/v1/test_external_lb_dp.py index 7a774ef8a22b..7820600738ab 100644 --- a/tests/v1/test_external_lb_dp.py +++ b/tests/v1/test_external_lb_dp.py @@ -154,23 +154,30 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]): ] +def _get_parallel_config(server: RemoteOpenAIServer): + response = requests.get(server.url_for("server_info?config_format=json")) + response.raise_for_status() + + vllm_config = response.json()["vllm_config"] + return vllm_config["parallel_config"] + + def test_external_lb_server_info(server_manager): servers = server_manager.servers api_server_count = server_manager.api_server_count for i, (server, _) in enumerate(servers): - response = requests.get(server.url_for("server_info")) - response.raise_for_status() - - vllm_config = response.json() - parallel_config = vllm_config["parallel_config"] + print(f"Testing {i=}") - assert parallel_config[ - "api_process_count"] == api_server_count, f"Failed ({i=})" - assert parallel_config["api_process_rank"] == 0, f"Failed ({i=})" + # Each request will hit one of the API servers + parallel_configs = [_get_parallel_config(server) for _ in range(50)] + api_process_counts = [c["api_process_count"] for c in parallel_configs] + 
api_process_ranks = [c["api_process_rank"] for c in parallel_configs] - # Logging in case a non-assert exception occurs - print(f"Passed ({i=})") + assert all(c == api_server_count + for c in api_process_counts), api_process_counts + assert all(0 <= r < api_server_count + for r in api_process_ranks), api_process_ranks @pytest.mark.asyncio diff --git a/tests/v1/test_hybrid_lb_dp.py b/tests/v1/test_hybrid_lb_dp.py index fa9f645ad2af..339abe18c979 100644 --- a/tests/v1/test_hybrid_lb_dp.py +++ b/tests/v1/test_hybrid_lb_dp.py @@ -178,23 +178,30 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]): ] +def _get_parallel_config(server: RemoteOpenAIServer): + response = requests.get(server.url_for("server_info?config_format=json")) + response.raise_for_status() + + vllm_config = response.json()["vllm_config"] + return vllm_config["parallel_config"] + + def test_hybrid_dp_server_info(server_manager): servers = server_manager.servers api_server_count = server_manager.api_server_count for i, (server, _) in enumerate(servers): - response = requests.get(server.url_for("server_info")) - response.raise_for_status() - - vllm_config = response.json() - parallel_config = vllm_config["parallel_config"] + print(f"Testing {i=}") - assert parallel_config[ - "api_process_count"] == api_server_count, f"Failed ({i=})" - assert parallel_config["api_process_rank"] == i, f"Failed ({i=})" + # Each request will hit one of the API servers + parallel_configs = [_get_parallel_config(server) for _ in range(50)] + api_process_counts = [c["api_process_count"] for c in parallel_configs] + api_process_ranks = [c["api_process_rank"] for c in parallel_configs] - # Logging in case a non-assert exception occurs - print(f"Passed ({i=})") + assert all(c == api_server_count + for c in api_process_counts), api_process_counts + assert all(0 <= r < api_server_count + for r in api_process_ranks), api_process_ranks @pytest.mark.asyncio diff --git a/tests/v1/test_internal_lb_dp.py 
b/tests/v1/test_internal_lb_dp.py index 939e8e060a8e..66f5aca14c57 100644 --- a/tests/v1/test_internal_lb_dp.py +++ b/tests/v1/test_internal_lb_dp.py @@ -102,6 +102,8 @@ def start_server(sidx: int, r: int, sargs: list[str]): sargs, auto_port=False, env_dict={ + "VLLM_SERVER_DEV_MODE": + "1", current_platform.device_control_env_var: ",".join( str( @@ -215,7 +217,10 @@ def start_api_server(): self.model_name, api_server_args, auto_port=False, - env_dict={}) # No GPUs needed for API-only server + env_dict={ + "VLLM_SERVER_DEV_MODE": "1", + # No GPUs needed for API-only server + }) server.__enter__() print(f"API-only server started successfully with " f"{self.api_server_count} API servers") @@ -231,8 +236,6 @@ def start_engines_server(): engines_server_args, auto_port=False, env_dict={ - "VLLM_SERVER_DEV_MODE": - "1", current_platform.device_control_env_var: ",".join( str( @@ -340,23 +343,27 @@ async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer, yield client -def test_multinode_dp_server_info(server_manager): - servers = server_manager.servers - api_server_count = server_manager.api_server_count +def _get_parallel_config(server: RemoteOpenAIServer): + response = requests.get(server.url_for("server_info?config_format=json")) + response.raise_for_status() - for i, (server, _) in enumerate(servers): - response = requests.get(server.url_for("server_info")) - response.raise_for_status() + vllm_config = response.json()["vllm_config"] + return vllm_config["parallel_config"] - vllm_config = response.json() - parallel_config = vllm_config["parallel_config"] - assert parallel_config[ - "api_process_count"] == api_server_count, f"Failed ({i=})" - assert parallel_config["api_process_rank"] == i, f"Failed ({i=})" +def test_multinode_dp_server_info(server_manager): + head_server = server_manager.servers[0][0] + api_server_count = server_manager.api_server_count + + # Each request will hit one of the API servers + parallel_configs = 
[_get_parallel_config(head_server) for _ in range(50)] + api_process_counts = [c["api_process_count"] for c in parallel_configs] + api_process_ranks = [c["api_process_rank"] for c in parallel_configs] - # Logging in case a non-assert exception occurs - print(f"Passed ({i=})") + assert all(c == api_server_count + for c in api_process_counts), api_process_counts + assert all(0 <= r < api_server_count + for r in api_process_ranks), api_process_ranks @pytest.mark.asyncio diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9a2470649c8d..3a2039eea45f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -19,13 +19,14 @@ from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import Annotated, Any, Callable, Optional +from typing import Annotated, Any, Callable, Literal, Optional import prometheus_client import pydantic import regex as re import uvloop -from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request +from fastapi import (APIRouter, Depends, FastAPI, Form, HTTPException, Query, + Request) from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -1038,9 +1039,21 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request): logger.warning("SECURITY WARNING: Development endpoints are enabled! 
" "This should NOT be used in production!") + PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig) + @router.get("/server_info") - async def show_server_info(raw_request: Request): - server_info = {"vllm_config": str(raw_request.app.state.vllm_config)} + async def show_server_info( + raw_request: Request, + config_format: Annotated[Literal["text", "json"], + Query()] = "text", + ): + vllm_config: VllmConfig = raw_request.app.state.vllm_config + server_info = { + "vllm_config": + str(vllm_config) + if config_format == "text" else PydanticVllmConfig.dump_python( + vllm_config, mode="json", fallback=str) + } return JSONResponse(content=server_info) @router.post("/reset_prefix_cache") From 3f97be411d5eaa0da98b7febae44db5092e9cd01 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 08:19:49 +0000 Subject: [PATCH 079/130] Update docstring Signed-off-by: DarkLight1337 --- vllm/config/parallel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index b7f7231ef6da..a3aba2d01df8 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -98,9 +98,9 @@ class ParallelConfig: --data-parallel-start-rank.""" api_process_count: int = 1 - """[Internal] The number of API processes initialized.""" + """[Internal CLI arg] The number of API processes initialized.""" api_process_rank: int = 0 - """[Internal] The rank of this API process.""" + """[Internal CLI arg] The rank of this API process.""" enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" From 91ea959ad379b556b0032cf63ccc6ca8831a9b75 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 08:30:43 +0000 Subject: [PATCH 080/130] Optimize Signed-off-by: DarkLight1337 --- tests/v1/test_external_lb_dp.py | 7 ++++++- tests/v1/test_hybrid_lb_dp.py | 7 ++++++- tests/v1/test_internal_lb_dp.py | 7 ++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git 
a/tests/v1/test_external_lb_dp.py b/tests/v1/test_external_lb_dp.py index 7820600738ab..328079b55519 100644 --- a/tests/v1/test_external_lb_dp.py +++ b/tests/v1/test_external_lb_dp.py @@ -170,7 +170,12 @@ def test_external_lb_server_info(server_manager): print(f"Testing {i=}") # Each request will hit one of the API servers - parallel_configs = [_get_parallel_config(server) for _ in range(50)] + # `n_reqs` is set so that there is a good chance each server + # receives at least one request + n_reqs = 2 * api_server_count * api_server_count + parallel_configs = [ + _get_parallel_config(server) for _ in range(n_reqs) + ] api_process_counts = [c["api_process_count"] for c in parallel_configs] api_process_ranks = [c["api_process_rank"] for c in parallel_configs] diff --git a/tests/v1/test_hybrid_lb_dp.py b/tests/v1/test_hybrid_lb_dp.py index 339abe18c979..f09e4dedddae 100644 --- a/tests/v1/test_hybrid_lb_dp.py +++ b/tests/v1/test_hybrid_lb_dp.py @@ -194,7 +194,12 @@ def test_hybrid_dp_server_info(server_manager): print(f"Testing {i=}") # Each request will hit one of the API servers - parallel_configs = [_get_parallel_config(server) for _ in range(50)] + # `n_reqs` is set so that there is a good chance each server + # receives at least one request + n_reqs = 2 * api_server_count * api_server_count + parallel_configs = [ + _get_parallel_config(server) for _ in range(n_reqs) + ] api_process_counts = [c["api_process_count"] for c in parallel_configs] api_process_ranks = [c["api_process_rank"] for c in parallel_configs] diff --git a/tests/v1/test_internal_lb_dp.py b/tests/v1/test_internal_lb_dp.py index 66f5aca14c57..ead6b3f2c6ec 100644 --- a/tests/v1/test_internal_lb_dp.py +++ b/tests/v1/test_internal_lb_dp.py @@ -356,7 +356,12 @@ def test_multinode_dp_server_info(server_manager): api_server_count = server_manager.api_server_count # Each request will hit one of the API servers - parallel_configs = [_get_parallel_config(head_server) for _ in range(50)] + # `n_reqs` is set so 
that there is a good chance each server + # receives at least one request + n_reqs = 2 * api_server_count * api_server_count + parallel_configs = [ + _get_parallel_config(head_server) for _ in range(n_reqs) + ] api_process_counts = [c["api_process_count"] for c in parallel_configs] api_process_ranks = [c["api_process_rank"] for c in parallel_configs] From 6d0c0408833341c73102563ec6e6602823705761 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 08:32:50 +0000 Subject: [PATCH 081/130] Comment Signed-off-by: DarkLight1337 --- vllm/entrypoints/openai/api_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3a2039eea45f..6fdf1261eea7 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1053,6 +1053,7 @@ async def show_server_info( str(vllm_config) if config_format == "text" else PydanticVllmConfig.dump_python( vllm_config, mode="json", fallback=str) + # fallback=str is needed to handle e.g. 
torch.dtype } return JSONResponse(content=server_info) From 69c9ff09fdad12f0ff5471ec8edb03907acfed25 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 08:39:38 +0000 Subject: [PATCH 082/130] Improve error message Signed-off-by: DarkLight1337 --- vllm/v1/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index fb8b1ba4ac2a..841ffa2c9da8 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -142,11 +142,14 @@ def __init__( stats_update_address: Optional stats update address """ if len(args_per_server) != num_servers: - raise ValueError(f"Incorrect {len(args_per_server)=}") + raise ValueError(f"Incorrect {len(args_per_server)=}, " + f"expected {num_servers}") if len(input_addresses) != num_servers: - raise ValueError(f"Incorrect {len(input_addresses)=}") + raise ValueError(f"Incorrect {len(input_addresses)=}, " + f"expected {num_servers}") if len(output_addresses) != num_servers: - raise ValueError(f"Incorrect {len(output_addresses)=}") + raise ValueError(f"Incorrect {len(output_addresses)=}, " + f"expected {num_servers}") self.listen_address = listen_address self.sock = sock From 8e3ea32495710e7b067393510974daedcb72f083 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 08:43:21 +0000 Subject: [PATCH 083/130] Update docstring Signed-off-by: DarkLight1337 --- vllm/config/parallel.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index a3aba2d01df8..05561933c890 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -98,9 +98,21 @@ class ParallelConfig: --data-parallel-start-rank.""" api_process_count: int = 1 - """[Internal CLI arg] The number of API processes initialized.""" + """ + The number of API processes initialized. + + Note: + This is an internal config that should only be set by API server + scale-out. 
+ """ api_process_rank: int = 0 - """[Internal CLI arg] The rank of this API process.""" + """ + The rank of this API process. + + Note: + This is an internal config that should only be set by API server + scale-out. + """ enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" From 1dd58943ceb3af508f67689d4ac63a74387c4dba Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 10:04:59 +0000 Subject: [PATCH 084/130] Fixture Signed-off-by: DarkLight1337 --- tests/v1/test_external_lb_dp.py | 1 + tests/v1/test_hybrid_lb_dp.py | 1 + tests/v1/test_internal_lb_dp.py | 1 + 3 files changed, 3 insertions(+) diff --git a/tests/v1/test_external_lb_dp.py b/tests/v1/test_external_lb_dp.py index 328079b55519..54e00c889b8e 100644 --- a/tests/v1/test_external_lb_dp.py +++ b/tests/v1/test_external_lb_dp.py @@ -140,6 +140,7 @@ def server_manager(request, default_server_args): yield server_manager +@pytest.fixture def servers(server_manager): return server_manager.servers diff --git a/tests/v1/test_hybrid_lb_dp.py b/tests/v1/test_hybrid_lb_dp.py index f09e4dedddae..3c35ea039ed4 100644 --- a/tests/v1/test_hybrid_lb_dp.py +++ b/tests/v1/test_hybrid_lb_dp.py @@ -164,6 +164,7 @@ def server_manager(request, default_server_args): yield server_manager +@pytest.fixture def servers(server_manager): return server_manager.servers diff --git a/tests/v1/test_internal_lb_dp.py b/tests/v1/test_internal_lb_dp.py index ead6b3f2c6ec..e67c2af5a12b 100644 --- a/tests/v1/test_internal_lb_dp.py +++ b/tests/v1/test_internal_lb_dp.py @@ -311,6 +311,7 @@ def server_manager(request, default_server_args): yield server_manager +@pytest.fixture def servers(server_manager): return server_manager.servers From d06434faa86b6048e4eedae59b83839025c84643 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 15:51:18 +0000 Subject: [PATCH 085/130] Address comments in serve.py Signed-off-by: DarkLight1337 --- 
.../test_api_server_process_manager.py | 4 +-- vllm/entrypoints/cli/serve.py | 18 ++++------ vllm/v1/utils.py | 33 +++++++------------ 3 files changed, 20 insertions(+), 35 deletions(-) diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py index 882382a38543..a39ba6402668 100644 --- a/tests/entrypoints/test_api_server_process_manager.py +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -36,10 +36,10 @@ def api_server_args(): "localhost:8000", "sock": sock, + "args": + "test_args", # Simple string to avoid pickling issues "num_servers": 3, - "args_per_server": - ["test_args"] * 3, # Simple string to avoid pickling issues "input_addresses": [ "tcp://127.0.0.1:5001", "tcp://127.0.0.1:5002", "tcp://127.0.0.1:5003" diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 2ebffe9cb690..216bcfb1da40 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -3,7 +3,6 @@ import argparse import signal -from copy import deepcopy from typing import Optional import uvloop @@ -140,14 +139,11 @@ def run_multi_api_server(args: argparse.Namespace): assert num_api_servers > 0 # No need to set api_process_rank for EngineCore processes - args.api_process_count = args.api_server_count + args.api_process_count = num_api_servers if num_api_servers > 1: setup_multiprocess_prometheus() - # Not compatible with API server scale-out - args.mm_processor_cache_gb = 0 - listen_address, sock = setup_server(args) engine_args = vllm.AsyncEngineArgs.from_cli_args(args) @@ -171,11 +167,6 @@ def run_multi_api_server(args: argparse.Namespace): hybrid_dp_lb = parallel_config.data_parallel_hybrid_lb assert external_dp_lb or hybrid_dp_lb or dp_rank == 0 - # Set api_process_rank for API server processes - args_per_server = [deepcopy(args) for _ in range(num_api_servers)] - for server_idx in range(num_api_servers): - args_per_server[server_idx].api_process_rank = server_idx - 
api_server_manager: Optional[APIServerProcessManager] = None with launch_core_engines(vllm_config, executor_class, log_stats, @@ -187,7 +178,7 @@ def run_multi_api_server(args: argparse.Namespace): target_server_fn=run_api_server_worker_proc, listen_address=listen_address, sock=sock, - args_per_server=args_per_server, + args=args, num_servers=num_api_servers, input_addresses=addresses.inputs, output_addresses=addresses.outputs, @@ -223,9 +214,12 @@ def run_api_server_worker_proc(listen_address, client_config=None, **uvicorn_kwargs) -> None: """Entrypoint for individual API server worker processes.""" + client_config = client_config or {} + + args.api_process_rank = server_index = client_config.get("client_index", 0) + args.api_process_count = client_config.get("client_count", 1) # Set process title and add process-specific prefix to stdout and stderr. - server_index = client_config.get("client_index", 0) if client_config else 0 set_process_title("APIServer", str(server_index)) decorate_logs() diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 841ffa2c9da8..03fe8a794f56 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -123,8 +123,8 @@ def __init__( target_server_fn: Callable, listen_address: str, sock: Any, + args: argparse.Namespace, num_servers: int, - args_per_server: list[argparse.Namespace], input_addresses: list[str], output_addresses: list[str], stats_update_address: Optional[str] = None, @@ -135,44 +135,35 @@ def __init__( target_server_fn: Function to call for each API server process listen_address: Address to listen for client connections sock: Socket for client connections - num_servers: Number of API server processes to start args_per_server: Command line arguments for each API server + num_servers: Number of API server processes to start input_addresses: Input addresses for each API server output_addresses: Output addresses for each API server stats_update_address: Optional stats update address """ - if len(args_per_server) != num_servers: - 
raise ValueError(f"Incorrect {len(args_per_server)=}, " - f"expected {num_servers}") - if len(input_addresses) != num_servers: - raise ValueError(f"Incorrect {len(input_addresses)=}, " - f"expected {num_servers}") - if len(output_addresses) != num_servers: - raise ValueError(f"Incorrect {len(output_addresses)=}, " - f"expected {num_servers}") - self.listen_address = listen_address self.sock = sock + self.args = args # Start API servers spawn_context = multiprocessing.get_context("spawn") self.processes: list[BaseProcess] = [] - for i in range(num_servers): + for i, in_addr, out_addr in zip(range(num_servers), input_addresses, + output_addresses): client_config = { - "input_address": input_addresses[i], - "output_address": output_addresses[i], + "input_address": in_addr, + "output_address": out_addr, "client_count": num_servers, - "client_index": i, + "client_index": i } if stats_update_address is not None: client_config["stats_update_address"] = stats_update_address - proc = spawn_context.Process( - target=target_server_fn, - name=f"ApiServer_{i}", - args=(listen_address, sock, args_per_server[i], client_config), - ) + proc = spawn_context.Process(target=target_server_fn, + name=f"ApiServer_{i}", + args=(listen_address, sock, args, + client_config)) self.processes.append(proc) proc.start() From dac11709edb1f7b03e7609c15b71b953d3ac5136 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:04:49 +0000 Subject: [PATCH 086/130] Rename attributes to internal and validate Signed-off-by: DarkLight1337 --- tests/v1/test_external_lb_dp.py | 6 +++-- tests/v1/test_hybrid_lb_dp.py | 6 +++-- tests/v1/test_internal_lb_dp.py | 4 +-- vllm/config/parallel.py | 48 +++++++++++++++++++++------------ vllm/engine/arg_utils.py | 8 +++--- vllm/entrypoints/cli/serve.py | 9 ++++--- vllm/multimodal/cache.py | 2 +- 7 files changed, 51 insertions(+), 32 deletions(-) diff --git a/tests/v1/test_external_lb_dp.py b/tests/v1/test_external_lb_dp.py index 54e00c889b8e..862a76f3c4e2 
100644 --- a/tests/v1/test_external_lb_dp.py +++ b/tests/v1/test_external_lb_dp.py @@ -177,8 +177,10 @@ def test_external_lb_server_info(server_manager): parallel_configs = [ _get_parallel_config(server) for _ in range(n_reqs) ] - api_process_counts = [c["api_process_count"] for c in parallel_configs] - api_process_ranks = [c["api_process_rank"] for c in parallel_configs] + api_process_counts = [ + c["_api_process_count"] for c in parallel_configs + ] + api_process_ranks = [c["_api_process_rank"] for c in parallel_configs] assert all(c == api_server_count for c in api_process_counts), api_process_counts diff --git a/tests/v1/test_hybrid_lb_dp.py b/tests/v1/test_hybrid_lb_dp.py index 3c35ea039ed4..552436f818d7 100644 --- a/tests/v1/test_hybrid_lb_dp.py +++ b/tests/v1/test_hybrid_lb_dp.py @@ -201,8 +201,10 @@ def test_hybrid_dp_server_info(server_manager): parallel_configs = [ _get_parallel_config(server) for _ in range(n_reqs) ] - api_process_counts = [c["api_process_count"] for c in parallel_configs] - api_process_ranks = [c["api_process_rank"] for c in parallel_configs] + api_process_counts = [ + c["_api_process_count"] for c in parallel_configs + ] + api_process_ranks = [c["_api_process_rank"] for c in parallel_configs] assert all(c == api_server_count for c in api_process_counts), api_process_counts diff --git a/tests/v1/test_internal_lb_dp.py b/tests/v1/test_internal_lb_dp.py index e67c2af5a12b..e965645711ee 100644 --- a/tests/v1/test_internal_lb_dp.py +++ b/tests/v1/test_internal_lb_dp.py @@ -363,8 +363,8 @@ def test_multinode_dp_server_info(server_manager): parallel_configs = [ _get_parallel_config(head_server) for _ in range(n_reqs) ] - api_process_counts = [c["api_process_count"] for c in parallel_configs] - api_process_ranks = [c["api_process_rank"] for c in parallel_configs] + api_process_counts = [c["_api_process_count"] for c in parallel_configs] + api_process_ranks = [c["_api_process_rank"] for c in parallel_configs] assert all(c == api_server_count 
for c in api_process_counts), api_process_counts diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 05561933c890..4405b7fa5d5a 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -97,23 +97,6 @@ class ParallelConfig: between vLLM nodes/replicas. Set explicitly in conjunction with --data-parallel-start-rank.""" - api_process_count: int = 1 - """ - The number of API processes initialized. - - Note: - This is an internal config that should only be set by API server - scale-out. - """ - api_process_rank: int = 0 - """ - The rank of this API process. - - Note: - This is an internal config that should only be set by API server - scale-out. - """ - enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" enable_eplb: bool = False @@ -188,6 +171,26 @@ class is dynamically inherited by the worker class. This is used to inject Set to be private as it's not intended to be configured by users. """ + _api_process_count: int = -1 + """ + The number of API processes initialized, or `-1` if API server scale-out + is not used. + + Note: + This is an internal config that is only valid for and + should only be set by API server scale-out. + """ + _api_process_rank: int = -1 + """ + The rank of this API process, or `-1` if API server scale-out + is not used. It is also `-1` for engine core processes + under API server scale-out. + + Note: + This is an internal config that is only valid for and + should only be set by API server scale-out. 
+ """ + @property def world_size_across_dp(self) -> int: """world_size_across_dp is TPxPPxDP, it is the size of the world @@ -424,6 +427,17 @@ def __post_init__(self) -> None: if self.distributed_executor_backend is None and self.world_size == 1: self.distributed_executor_backend = "uni" + if self._api_process_count == -1 and self._api_process_rank != -1: + raise ValueError("`_api_process_rank` is an internal config " + "and should not be set by users") + + if (self._api_process_count != -1 and + not -1 <= self._api_process_rank < self._api_process_count): + raise ValueError( + "Invalid value of `_api_process_rank`. " + f"Expected to be `-1` or `[0, {self._api_process_count})`, " + f"but found: {self._api_process_rank}") + @property def use_ray(self) -> bool: return self.distributed_executor_backend == "ray" or ( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 91c98f6e64b2..1ec322ba4f86 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -303,11 +303,11 @@ class EngineArgs: data_parallel_rpc_port: Optional[int] = None data_parallel_hybrid_lb: bool = False data_parallel_backend: str = ParallelConfig.data_parallel_backend - api_process_count: int = ParallelConfig.api_process_count - api_process_rank: int = ParallelConfig.api_process_rank enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config") enable_eplb: bool = ParallelConfig.enable_eplb + _api_process_count: int = ParallelConfig._api_process_count + _api_process_rank: int = ParallelConfig._api_process_rank num_redundant_experts: int = EPLBConfig.num_redundant_experts eplb_window_size: int = EPLBConfig.window_size eplb_step_interval: int = EPLBConfig.step_interval @@ -1285,8 +1285,6 @@ def create_engine_config( data_parallel_rpc_port=data_parallel_rpc_port, data_parallel_backend=self.data_parallel_backend, data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, - 
api_process_count=self.api_process_count, - api_process_rank=self.api_process_rank, enable_expert_parallel=self.enable_expert_parallel, enable_eplb=self.enable_eplb, eplb_config=self.eplb_config, @@ -1298,6 +1296,8 @@ def create_engine_config( distributed_executor_backend=self.distributed_executor_backend, worker_cls=self.worker_cls, worker_extension_cls=self.worker_extension_cls, + _api_process_count=self._api_process_count, + _api_process_rank=self._api_process_rank, ) speculative_config = self.create_speculative_config( diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 216bcfb1da40..c1ae9726bbec 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -138,8 +138,8 @@ def run_multi_api_server(args: argparse.Namespace): num_api_servers: int = args.api_server_count assert num_api_servers > 0 - # No need to set api_process_rank for EngineCore processes - args.api_process_count = num_api_servers + args._api_process_count = num_api_servers + args._api_process_rank = -1 if num_api_servers > 1: setup_multiprocess_prometheus() @@ -216,8 +216,9 @@ def run_api_server_worker_proc(listen_address, """Entrypoint for individual API server worker processes.""" client_config = client_config or {} - args.api_process_rank = server_index = client_config.get("client_index", 0) - args.api_process_count = client_config.get("client_count", 1) + args._api_process_rank = server_index = client_config.get( + "client_index", 0) + args._api_process_count = client_config.get("client_count", 1) # Set process title and add process-specific prefix to stdout and stderr. 
set_process_title("APIServer", str(server_index)) diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 6a3a63199a67..b4e368147fda 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -380,7 +380,7 @@ def _enable_processor_cache( def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool: parallel_config = vllm_config.parallel_config - supports_ipc_cache = ((parallel_config.api_process_count == 1 + supports_ipc_cache = ((parallel_config._api_process_count == 1 and parallel_config.data_parallel_size == 1) or parallel_config.data_parallel_external_lb) From 3f62e0eb28c7495892156155361b303104560f7c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:07:36 +0000 Subject: [PATCH 087/130] Fix Signed-off-by: DarkLight1337 --- vllm/config/parallel.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 4405b7fa5d5a..24971147eaaa 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -171,15 +171,15 @@ class is dynamically inherited by the worker class. This is used to inject Set to be private as it's not intended to be configured by users. """ - _api_process_count: int = -1 + _api_process_count: int = 1 """ - The number of API processes initialized, or `-1` if API server scale-out - is not used. + The number of API processes initialized. Note: This is an internal config that is only valid for and should only be set by API server scale-out. 
""" + _api_process_rank: int = -1 """ The rank of this API process, or `-1` if API server scale-out @@ -427,11 +427,11 @@ def __post_init__(self) -> None: if self.distributed_executor_backend is None and self.world_size == 1: self.distributed_executor_backend = "uni" - if self._api_process_count == -1 and self._api_process_rank != -1: + if self._api_process_count == 1 and self._api_process_rank != -1: raise ValueError("`_api_process_rank` is an internal config " "and should not be set by users") - if (self._api_process_count != -1 and + if (self._api_process_count > 1 and not -1 <= self._api_process_rank < self._api_process_count): raise ValueError( "Invalid value of `_api_process_rank`. " From df9f9cbc5bf0834e5fa0eb46cc4b099ced216d38 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:15:11 +0000 Subject: [PATCH 088/130] Update Signed-off-by: DarkLight1337 --- vllm/entrypoints/cli/serve.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index c1ae9726bbec..345ca9bad009 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -216,12 +216,11 @@ def run_api_server_worker_proc(listen_address, """Entrypoint for individual API server worker processes.""" client_config = client_config or {} - args._api_process_rank = server_index = client_config.get( - "client_index", 0) args._api_process_count = client_config.get("client_count", 1) + args._api_process_rank = client_config.get("client_index", 0) # Set process title and add process-specific prefix to stdout and stderr. 
- set_process_title("APIServer", str(server_index)) + set_process_title("APIServer", str(args._api_process_rank)) decorate_logs() uvloop.run( From 0ec4e66776583722dc198765f75c424b1b44a25f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:18:15 +0000 Subject: [PATCH 089/130] Push down Signed-off-by: DarkLight1337 --- vllm/entrypoints/cli/serve.py | 5 ++--- vllm/entrypoints/openai/api_server.py | 10 +++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 345ca9bad009..ccab442b72ce 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -216,11 +216,10 @@ def run_api_server_worker_proc(listen_address, """Entrypoint for individual API server worker processes.""" client_config = client_config or {} - args._api_process_count = client_config.get("client_count", 1) - args._api_process_rank = client_config.get("client_index", 0) + server_index = client_config.get("client_index", 0) # Set process title and add process-specific prefix to stdout and stderr. 
- set_process_title("APIServer", str(args._api_process_rank)) + set_process_title("APIServer", str(server_index)) decorate_logs() uvloop.run( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 6fdf1261eea7..6bc5aab22db6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1950,12 +1950,14 @@ async def run_server_worker(listen_address, client_config=None, **uvicorn_kwargs) -> None: """Run a single API server worker.""" + client_config = client_config or {} + + args._api_process_count = client_config.get("client_count", 1) + args._api_process_rank = client_config.get("client_index", 0) if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: ToolParserManager.import_tool_parser(args.tool_parser_plugin) - server_index = client_config.get("client_index", 0) if client_config else 0 - # Load logging config for uvicorn if specified log_config = load_log_config(args.log_config_file) if log_config is not None: @@ -1971,7 +1973,9 @@ async def run_server_worker(listen_address, vllm_config = await engine_client.get_vllm_config() await init_app_state(engine_client, vllm_config, app.state, args) - logger.info("Starting vLLM API server %d on %s", server_index, + print(vllm_config) + logger.info("Starting vLLM API server %d/%d on %s", + args._api_process_rank, args._api_server_count, listen_address) shutdown_task = await serve_http( app, From e500a9bce4584ae0188986a21c93198376342075 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:20:48 +0000 Subject: [PATCH 090/130] Update Signed-off-by: DarkLight1337 --- vllm/config/parallel.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 24971147eaaa..814cef03f987 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -180,10 +180,9 @@ class is dynamically inherited by the worker class. 
This is used to inject should only be set by API server scale-out. """ - _api_process_rank: int = -1 + _api_process_rank: int = 0 """ - The rank of this API process, or `-1` if API server scale-out - is not used. It is also `-1` for engine core processes + The rank of this API process, or `-1` for engine core processes under API server scale-out. Note: @@ -427,10 +426,6 @@ def __post_init__(self) -> None: if self.distributed_executor_backend is None and self.world_size == 1: self.distributed_executor_backend = "uni" - if self._api_process_count == 1 and self._api_process_rank != -1: - raise ValueError("`_api_process_rank` is an internal config " - "and should not be set by users") - if (self._api_process_count > 1 and not -1 <= self._api_process_rank < self._api_process_count): raise ValueError( From 36fb875c092e77f0b8aa701d14d050a86cd4a500 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:23:32 +0000 Subject: [PATCH 091/130] Fix Signed-off-by: DarkLight1337 --- vllm/v1/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 03fe8a794f56..b5750c82db02 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -135,7 +135,7 @@ def __init__( target_server_fn: Function to call for each API server process listen_address: Address to listen for client connections sock: Socket for client connections - args_per_server: Command line arguments for each API server + args: Command line arguments num_servers: Number of API server processes to start input_addresses: Input addresses for each API server output_addresses: Output addresses for each API server From e08e7b788092265b56c382caf1c8e8682be7dbb3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:25:03 +0000 Subject: [PATCH 092/130] Try deepcopy Signed-off-by: DarkLight1337 --- vllm/entrypoints/cli/serve.py | 1 - vllm/entrypoints/openai/api_server.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git 
a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index ccab442b72ce..0d2b0e5057c9 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -215,7 +215,6 @@ def run_api_server_worker_proc(listen_address, **uvicorn_kwargs) -> None: """Entrypoint for individual API server worker processes.""" client_config = client_config or {} - server_index = client_config.get("client_index", 0) # Set process title and add process-specific prefix to stdout and stderr. diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 6bc5aab22db6..03baae1befb1 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,6 +17,7 @@ from argparse import Namespace from collections.abc import AsyncIterator, Awaitable from contextlib import asynccontextmanager +from copy import deepcopy from functools import partial from http import HTTPStatus from typing import Annotated, Any, Callable, Literal, Optional @@ -1952,6 +1953,7 @@ async def run_server_worker(listen_address, """Run a single API server worker.""" client_config = client_config or {} + args = deepcopy(args) args._api_process_count = client_config.get("client_count", 1) args._api_process_rank = client_config.get("client_index", 0) From 875c7e3ae6f49eba4945020b5c8ec90c562cff91 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:26:03 +0000 Subject: [PATCH 093/130] No print Signed-off-by: DarkLight1337 --- vllm/entrypoints/openai/api_server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 03baae1befb1..646dfe35abad 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1975,7 +1975,6 @@ async def run_server_worker(listen_address, vllm_config = await engine_client.get_vllm_config() await init_app_state(engine_client, vllm_config, app.state, args) - print(vllm_config) 
logger.info("Starting vLLM API server %d/%d on %s", args._api_process_rank, args._api_server_count, listen_address) From d9a5c81993ba9cd04868665332b88fa74459ea73 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:27:51 +0000 Subject: [PATCH 094/130] Simplify Signed-off-by: DarkLight1337 --- vllm/config/parallel.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 814cef03f987..f822a6901b22 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -96,7 +96,6 @@ class ParallelConfig: between local data parallel ranks, but an external LB balances between vLLM nodes/replicas. Set explicitly in conjunction with --data-parallel-start-rank.""" - enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" enable_eplb: bool = False @@ -426,8 +425,7 @@ def __post_init__(self) -> None: if self.distributed_executor_backend is None and self.world_size == 1: self.distributed_executor_backend = "uni" - if (self._api_process_count > 1 and - not -1 <= self._api_process_rank < self._api_process_count): + if not -1 <= self._api_process_rank < self._api_process_count: raise ValueError( "Invalid value of `_api_process_rank`. 
" f"Expected to be `-1` or `[0, {self._api_process_count})`, " From dabe421ede267bcf3c8b1794ba034ed6e2009c66 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:39:53 +0000 Subject: [PATCH 095/130] Fix Signed-off-by: DarkLight1337 --- vllm/entrypoints/openai/api_server.py | 22 ++++++++++------------ vllm/v1/engine/core_client.py | 2 +- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 646dfe35abad..db4e77be0311 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,7 +17,6 @@ from argparse import Namespace from collections.abc import AsyncIterator, Awaitable from contextlib import asynccontextmanager -from copy import deepcopy from functools import partial from http import HTTPStatus from typing import Annotated, Any, Callable, Literal, Optional @@ -172,6 +171,9 @@ async def build_async_engine_client( # Context manager to handle engine_client lifecycle # Ensures everything is shutdown and cleaned up on error/exit engine_args = AsyncEngineArgs.from_cli_args(args) + if client_config: + engine_args._api_process_count = client_config.get("client_count", 1) + engine_args._api_process_rank = client_config.get("client_index", 0) if disable_frontend_multiprocessing is None: disable_frontend_multiprocessing = bool( @@ -201,6 +203,7 @@ async def build_async_engine_client_from_engine_args( Returns the Client or None if the creation failed. """ + client_config = dict(client_config) if client_config else {} # Create the EngineConfig (determines if we can use V1). 
vllm_config = engine_args.create_engine_config(usage_context=usage_context) @@ -214,10 +217,10 @@ async def build_async_engine_client_from_engine_args( from vllm.v1.engine.async_llm import AsyncLLM async_llm: Optional[AsyncLLM] = None - client_count = client_config.pop( - "client_count") if client_config else 1 - client_index = client_config.pop( - "client_index") if client_config else 0 + print("client_config before", client_config) + client_count = client_config.pop("client_count", 1) + client_index = client_config.pop("client_index", 0) + print("client_config after", client_config) try: async_llm = AsyncLLM.from_vllm_config( vllm_config=vllm_config, @@ -1951,11 +1954,6 @@ async def run_server_worker(listen_address, client_config=None, **uvicorn_kwargs) -> None: """Run a single API server worker.""" - client_config = client_config or {} - - args = deepcopy(args) - args._api_process_count = client_config.get("client_count", 1) - args._api_process_rank = client_config.get("client_index", 0) if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: ToolParserManager.import_tool_parser(args.tool_parser_plugin) @@ -1975,8 +1973,8 @@ async def run_server_worker(listen_address, vllm_config = await engine_client.get_vllm_config() await init_app_state(engine_client, vllm_config, app.state, args) - logger.info("Starting vLLM API server %d/%d on %s", - args._api_process_rank, args._api_server_count, + logger.info("Starting vLLM API server %d on %s", + vllm_config.parallel_config._api_process_rank, listen_address) shutdown_task = await serve_http( app, diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 54231cebea20..b95f4175f50c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -435,7 +435,7 @@ def __init__( self.engines_running = False self.stats_update_address: Optional[str] = None - if client_addresses is not None: + if client_addresses: # Engines are managed externally to this client. 
input_address = client_addresses["input_address"] output_address = client_addresses["output_address"] From fdc9b6e43e5e621f94c5ff944a29b9d4e4f7a65f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:41:52 +0000 Subject: [PATCH 096/130] Update Signed-off-by: DarkLight1337 --- vllm/entrypoints/openai/api_server.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index db4e77be0311..c167ecd9833f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -203,7 +203,6 @@ async def build_async_engine_client_from_engine_args( Returns the Client or None if the creation failed. """ - client_config = dict(client_config) if client_config else {} # Create the EngineConfig (determines if we can use V1). vllm_config = engine_args.create_engine_config(usage_context=usage_context) @@ -217,10 +216,12 @@ async def build_async_engine_client_from_engine_args( from vllm.v1.engine.async_llm import AsyncLLM async_llm: Optional[AsyncLLM] = None - print("client_config before", client_config) + + # Don't mutate the input client_config + client_config = dict(client_config) if client_config else {} client_count = client_config.pop("client_count", 1) client_index = client_config.pop("client_index", 0) - print("client_config after", client_config) + try: async_llm = AsyncLLM.from_vllm_config( vllm_config=vllm_config, From 94ec51d4401603c6e45ed2e1b445e30e9dcbb79f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:45:24 +0000 Subject: [PATCH 097/130] Type checking Signed-off-by: DarkLight1337 --- vllm/entrypoints/cli/serve.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 0d2b0e5057c9..de47bf00932e 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -138,15 +138,15 @@ def 
run_multi_api_server(args: argparse.Namespace): num_api_servers: int = args.api_server_count assert num_api_servers > 0 - args._api_process_count = num_api_servers - args._api_process_rank = -1 - if num_api_servers > 1: setup_multiprocess_prometheus() listen_address, sock = setup_server(args) engine_args = vllm.AsyncEngineArgs.from_cli_args(args) + engine_args._api_process_count = num_api_servers + engine_args._api_process_rank = -1 + usage_context = UsageContext.OPENAI_API_SERVER vllm_config = engine_args.create_engine_config(usage_context=usage_context) From 013605dc1b6328493044ffdf4c74cde23d49b15c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 16:57:45 +0000 Subject: [PATCH 098/130] Less diff Signed-off-by: DarkLight1337 --- vllm/config/parallel.py | 6 ------ vllm/engine/arg_utils.py | 2 -- vllm/entrypoints/cli/serve.py | 4 ++-- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 2ef9d471d2b0..f822a6901b22 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -96,12 +96,6 @@ class ParallelConfig: between local data parallel ranks, but an external LB balances between vLLM nodes/replicas. 
Set explicitly in conjunction with --data-parallel-start-rank.""" - - api_process_count: int = 1 - """[Internal] The number of API processes initialized.""" - api_process_rank: int = 0 - """[Internal] The rank of this API process.""" - enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" enable_eplb: bool = False diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index bd3a99ca3519..bcbd4280a21b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1287,8 +1287,6 @@ def create_engine_config( data_parallel_rpc_port=data_parallel_rpc_port, data_parallel_backend=self.data_parallel_backend, data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, - api_process_count=self.api_process_count, - api_process_rank=self.api_process_rank, enable_expert_parallel=self.enable_expert_parallel, enable_eplb=self.enable_eplb, eplb_config=self.eplb_config, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index c25a8d20b22f..de47bf00932e 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -3,7 +3,7 @@ import argparse import signal -from typing import Any, Optional +from typing import Optional import uvloop @@ -174,7 +174,7 @@ def run_multi_api_server(args: argparse.Namespace): coordinator, addresses): # Construct common args for the APIServerProcessManager up-front. 
- api_server_manager_kwargs = dict[str, Any]( + api_server_manager_kwargs = dict( target_server_fn=run_api_server_worker_proc, listen_address=listen_address, sock=sock, From 22914dbb7eef2db99ad1d15b8e366ba7258ae869 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 17:07:33 +0000 Subject: [PATCH 099/130] Fix Signed-off-by: DarkLight1337 --- vllm/engine/arg_utils.py | 2 -- vllm/v1/engine/processor.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e956712229d2..c2b786f16322 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -314,8 +314,6 @@ class EngineArgs: data_parallel_rpc_port: Optional[int] = None data_parallel_hybrid_lb: bool = False data_parallel_backend: str = ParallelConfig.data_parallel_backend - api_process_count: int = ParallelConfig.api_process_count - api_process_rank: int = ParallelConfig.api_process_rank enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config") enable_eplb: bool = ParallelConfig.enable_eplb diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index ed4745e0b5fa..758ba5a53bec 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -439,12 +439,12 @@ def profile_run(self) -> None: gpu_allocation = allocate_gpu_mm_processors( mm_config.mm_processing_device, - parallel_config.api_process_count, + parallel_config._api_process_count, available_device_count=device_count, engine_device_count=parallel_config.world_size_across_dp, ) - api_process_rank = parallel_config.api_process_rank + api_process_rank = parallel_config._api_process_rank new_device = gpu_allocation[api_process_rank] logger.info("Multi-modal processor will be run on device %s", From d9b42f1d199ccf3c456ca07015b589ddf55810cc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 06:58:12 +0000 Subject: [PATCH 100/130] Fix 
Signed-off-by: DarkLight1337 --- tests/multimodal/test_utils.py | 52 ++++++++++++++++++---------------- vllm/v1/engine/processor.py | 1 - 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 53f3262786ba..34a68aa1fc36 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -201,6 +201,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int): assert metadata_sync == metadata_async +@pytest.mark.asyncio +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +@pytest.mark.parametrize("max_duration", [1, 60, 1800]) +@pytest.mark.parametrize("requested_fps", [2, 24]) +async def test_fetch_video_http_with_dynamic_loader( + video_url: str, max_duration: int, requested_fps: int, + monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic") + connector = MediaConnector( + media_io_kwargs={ + "video": { + "max_duration": max_duration, + "requested_fps": requested_fps, + } + }) + + video_sync, metadata_sync = connector.fetch_video(video_url) + video_async, metadata_async = await connector.fetch_video_async( + video_url) + + assert np.array_equal(video_sync, video_async) + assert metadata_sync == metadata_async + assert metadata_sync["video_backend"] == "opencv_dynamic" + + # yapf: disable @pytest.mark.parametrize( "case", @@ -311,31 +337,7 @@ def test_allocate_gpu_mm_processors(case): engine_device_count=engine_device_count, ) - -@pytest.mark.asyncio -@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) -@pytest.mark.parametrize("max_duration", [1, 60, 1800]) -@pytest.mark.parametrize("requested_fps", [2, 24]) -async def test_fetch_video_http_with_dynamic_loader( - video_url: str, max_duration: int, requested_fps: int, - monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic") - connector = MediaConnector( - 
media_io_kwargs={ - "video": { - "max_duration": max_duration, - "requested_fps": requested_fps, - } - }) - - video_sync, metadata_sync = connector.fetch_video(video_url) - video_async, metadata_async = await connector.fetch_video_async( - video_url) - - assert np.array_equal(video_sync, video_async) - assert metadata_sync == metadata_async - assert metadata_sync["video_backend"] == "opencv_dynamic" + assert gpu_allocation == expected_gpu_allocation # yapf: disable diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 802f6afa42a4..07577ee45d98 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -57,7 +57,6 @@ def __init__( self.lora_config = vllm_config.lora_config self.parallel_config = vllm_config.parallel_config self.scheduler_config = vllm_config.scheduler_config - self.decoding_config = vllm_config.decoding_config self.structured_outputs_config = vllm_config.structured_outputs_config self.tokenizer = tokenizer From 0133b0972e1d571beb066b935517edc083f4e92b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 06:59:48 +0000 Subject: [PATCH 101/130] Fix Signed-off-by: DarkLight1337 --- vllm/v1/engine/processor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 07577ee45d98..0589f53d41e8 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -39,8 +39,6 @@ logger = init_logger(__name__) -logger = init_logger(__name__) - class Processor: From ccbc13b602060e48f3b5e38d92a17096dde7e30c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 08:21:17 +0000 Subject: [PATCH 102/130] Update tests Signed-off-by: DarkLight1337 --- tests/entrypoints/llm/test_chat.py | 14 ++++++++++---- .../v1/engine/test_processor_multi_modal_uuids.py | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 5072e588e1b6..fc9353b2b99e 
100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -204,12 +204,16 @@ def test_chat_extra_kwargs(thinking_llm, enable_thinking): }), ("Qwen/Qwen2-Audio-7B-Instruct", "audio", {}), ]) -def test_mm_processing_gpu(model_id, modality, mm_init_kwargs): +@pytest.mark.parametrize("image_urls", + [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], + indirect=True) +def test_mm_processing_gpu(model_id, modality, mm_init_kwargs, + image_urls: list[str]): device = current_platform.device_name num_items = 2 if modality == "image": - messages = dummy_messages_from_image_url(TEST_IMAGE_ASSETS[:num_items]) + messages = dummy_messages_from_image_url(image_urls[:num_items]) elif modality == "audio": messages = dummy_messages_from_audio_url(TEST_AUDIO_URLS[:num_items]) else: @@ -235,14 +239,16 @@ def test_mm_processing_gpu(model_id, modality, mm_init_kwargs): }), ("Qwen/Qwen2-Audio-7B-Instruct", "audio", {}), ]) -def test_mm_processing_gpu_bad_device(model_id, modality, mm_init_kwargs): +@pytest.mark.parametrize("image_urls", [[TEST_IMAGE_ASSETS[0]]], indirect=True) +def test_mm_processing_gpu_bad_device(model_id, modality, mm_init_kwargs, + image_urls: list[str]): device = current_platform.device_name if device == "cpu": pytest.skip("Not applicable to CPU") num_items = 1 if modality == "image": - messages = dummy_messages_from_image_url(TEST_IMAGE_ASSETS[:num_items]) + messages = dummy_messages_from_image_url(image_urls[:num_items]) elif modality == "audio": messages = dummy_messages_from_audio_url(TEST_AUDIO_URLS[:num_items]) else: diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py index bdd41eece231..cdc825183c47 100644 --- a/tests/v1/engine/test_processor_multi_modal_uuids.py +++ b/tests/v1/engine/test_processor_multi_modal_uuids.py @@ -70,6 +70,7 @@ class _MockMMConfig: def __init__(self, gb: float): self.mm_processor_cache_gb = gb + self.mm_processing_device = "cpu" 
model_config.multimodal_config = _MockMMConfig( mm_cache_gb) # type: ignore[attr-defined] From f7c3e488f712f85ca440c06d1d65bb782dba10f4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 17:22:05 +0000 Subject: [PATCH 103/130] Apply device map Signed-off-by: DarkLight1337 --- vllm/multimodal/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index a9bc6b29f1a0..e598581d9cf2 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -24,6 +24,7 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather) +from vllm.platforms import current_platform from .audio import AudioMediaIO from .base import MediaIO @@ -372,7 +373,12 @@ def allocate_gpu_mm_processors( (device_idx, ) = map(int, rest) processor_gpu_idxs = [device_idx] * mm_processor_count - return [f"{device_type}:{gpu_idx}" for gpu_idx in processor_gpu_idxs] + device_map = current_platform.device_id_to_physical_device_id + + return [ + f"{device_type}:{device_map(gpu_idx)}" + for gpu_idx in processor_gpu_idxs + ] def argsort_mm_positions( From 7fbc6205219e29308b58821b6ce0d33400aacb78 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 17:47:03 +0000 Subject: [PATCH 104/130] Move GPU allocation to `run_multi_api_server` Signed-off-by: DarkLight1337 --- vllm/config/parallel.py | 9 +++++++ vllm/engine/arg_utils.py | 3 +++ vllm/entrypoints/cli/serve.py | 38 ++++++++++++++++++++++++----- vllm/v1/engine/processor.py | 45 +++++++++++++---------------------- 4 files changed, 60 insertions(+), 35 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 37a41bf6de71..4125dfd3db60 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -212,6 +212,15 @@ class is dynamically inherited by the worker class. This is used to inject should only be set by API server scale-out. 
""" + _renderer_gpu_allocation: list[str] = field(default_factory=list) + """ + The GPU allocated to the renderer of each API process. + + Note: + This is an internal config that is only valid for and + should only be set by API server scale-out. + """ + @property def world_size_across_dp(self) -> int: """world_size_across_dp is TPxPPxDP, it is the size of the world diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7a4bb0d41d23..087bd6419816 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -335,6 +335,8 @@ class EngineArgs: ParallelConfig.expert_placement_strategy _api_process_count: int = ParallelConfig._api_process_count _api_process_rank: int = ParallelConfig._api_process_rank + _renderer_gpu_allocation: list[ + str] = ParallelConfig._renderer_gpu_allocation num_redundant_experts: int = EPLBConfig.num_redundant_experts eplb_window_size: int = EPLBConfig.window_size eplb_step_interval: int = EPLBConfig.step_interval @@ -1373,6 +1375,7 @@ def create_engine_config( decode_context_parallel_size=self.decode_context_parallel_size, _api_process_count=self._api_process_count, _api_process_rank=self._api_process_rank, + _renderer_gpu_allocation=self._renderer_gpu_allocation, ) speculative_config = self.create_speculative_config( diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index de47bf00932e..12bca9f15877 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -17,6 +17,8 @@ from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG, show_filtered_argument_or_group_from_help) from vllm.logger import init_logger +from vllm.multimodal.utils import allocate_gpu_mm_processors +from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, decorate_logs, get_tcp_uri, set_process_title) @@ -150,6 +152,12 @@ def run_multi_api_server(args: argparse.Namespace): usage_context = 
UsageContext.OPENAI_API_SERVER vllm_config = engine_args.create_engine_config(usage_context=usage_context) + parallel_config = vllm_config.parallel_config + dp_rank = parallel_config.data_parallel_rank + external_dp_lb = parallel_config.data_parallel_external_lb + hybrid_dp_lb = parallel_config.data_parallel_hybrid_lb + assert external_dp_lb or hybrid_dp_lb or dp_rank == 0 + if num_api_servers > 1: if not envs.VLLM_USE_V1: raise ValueError("api_server_count > 1 is only supported for V1") @@ -158,15 +166,33 @@ def run_multi_api_server(args: argparse.Namespace): raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used " "with api_server_count > 1") + mm_config = vllm_config.model_config.multimodal_config + if mm_config and mm_config.mm_processing_device != "cpu": + device_count = current_platform.device_count() # type: ignore + + gpu_allocation = allocate_gpu_mm_processors( + mm_config.mm_processing_device, + parallel_config._api_process_count, + available_device_count=device_count, + engine_device_count=parallel_config.world_size_across_dp, + ) + + for i, device in enumerate(gpu_allocation): + logger.info( + "Multi-modal processor for APIServer_%s will be run " + "on device %s", + i, + device, + ) + + # Note: `engine_args` is sent to API servers + # while vllm_config is sent to EngineCore + engine_args._renderer_gpu_allocation = gpu_allocation + parallel_config._renderer_gpu_allocation = gpu_allocation + executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats - parallel_config = vllm_config.parallel_config - dp_rank = parallel_config.data_parallel_rank - external_dp_lb = parallel_config.data_parallel_external_lb - hybrid_dp_lb = parallel_config.data_parallel_hybrid_lb - assert external_dp_lb or hybrid_dp_lb or dp_rank == 0 - api_server_manager: Optional[APIServerProcessManager] = None with launch_core_engines(vllm_config, executor_class, log_stats, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py 
index 1f4eba1b54c4..c777626e19e6 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -17,8 +17,7 @@ from vllm.multimodal.cache import processor_cache_from_config from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict from vllm.multimodal.processing import EncDecMultiModalProcessor -from vllm.multimodal.utils import (allocate_gpu_mm_processors, - argsort_mm_positions) +from vllm.multimodal.utils import argsort_mm_positions from vllm.platforms import current_platform from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams @@ -560,25 +559,13 @@ def profile_run(self) -> None: if not mm_config: return - if mm_config.mm_processing_device != "cpu": - # Try to avoid using the same GPU as EngineCore - parallel_config = self.parallel_config - device_count = current_platform.device_count() # type: ignore - - gpu_allocation = allocate_gpu_mm_processors( - mm_config.mm_processing_device, - parallel_config._api_process_count, - available_device_count=device_count, - engine_device_count=parallel_config.world_size_across_dp, - ) - - api_process_rank = parallel_config._api_process_rank - new_device = gpu_allocation[api_process_rank] - - logger.info("Multi-modal processor will be run on device %s", - new_device) - mm_config.update_mm_processor_kwargs({"device": new_device}) + parallel_config = self.parallel_config + api_process_rank = parallel_config._api_process_rank + gpu_allocation = parallel_config._renderer_gpu_allocation + device = gpu_allocation[api_process_rank] + mm_config.update_mm_processor_kwargs({"device": device}) + if mm_config.mm_processing_device != "cpu": # Peak memory usage (required for this profiling) # is only tracked for CUDA if not current_platform.is_cuda_alike(): @@ -589,7 +576,7 @@ def profile_run(self) -> None: # device. # Compared to running profiling on every Processor in parallel, # this avoids non-deterministic peak memory usage calculation. 
- if api_process_rank != gpu_allocation.index(new_device): + if api_process_rank != gpu_allocation.index(device): return scheduler_config = self.scheduler_config @@ -599,18 +586,18 @@ def profile_run(self) -> None: self.mm_registry, ) - baseline_snapshot = MemorySnapshot(device=new_device) + baseline_snapshot = MemorySnapshot(device=device) # Only check init memory if we are sure that the EngineCore is not # loading weights or running profiling on the same GPU - # TODO: world_size_across_dp is too conservative for multi-node - new_device_index = torch.device(new_device).index or 0 - if new_device_index < parallel_config.world_size_across_dp: + new_device_index = torch.device(device).index or 0 + local_engine_count = parallel_config.data_parallel_size_local + if new_device_index < local_engine_count: logger.warning( "Both EngineCore and multi-modal processor are using " "the same GPU (%s). This may result in inaccurate memory " "profiling, and resource contention during inference.", - new_device, + device, ) else: check_enough_init_memory(baseline_snapshot, self.cache_config) @@ -624,16 +611,16 @@ def profile_run(self) -> None: mm_counts={modality: max_items_per_prompt}, ) - usage_mult = gpu_allocation.count(new_device) + usage_mult = gpu_allocation.count(device) memory_usage = diff.torch_peak_increase * usage_mult logger.info( "Multi-modal processing took %.4f GiB and %.6f seconds on %s", memory_usage / GiB_bytes, diff.profile_time, - new_device, + device, ) if memory_usage > diff.before_profile.free_memory: - raise ValueError(f"Not enough memory in {new_device} " + raise ValueError(f"Not enough memory in {device} " f"for multi-modal processor. 
" f"Try reducing `api_server_count`.") From 869ee6c1cd27009cbdbb0503a6ecb7cdfacd6689 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 18:03:52 +0000 Subject: [PATCH 105/130] Fix Signed-off-by: DarkLight1337 --- vllm/config/__init__.py | 12 ++++++++++++ vllm/config/parallel.py | 2 +- vllm/engine/arg_utils.py | 4 ++-- vllm/v1/engine/processor.py | 8 +++++--- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index ddd8de4324f6..28a7d9100205 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -647,6 +647,18 @@ def __post_init__(self): "the VLLM_ALL2ALL_BACKEND environment variable to "\ "deepep_low_latency and install the DeepEP kerenls." + if (mm_config := self.model_config.multimodal_config): + if self.parallel_config._renderer_gpu_allocation is None: + assert self.parallel_config._api_process_count == 1 + assert self.parallel_config._api_process_rank == 0 + self.parallel_config._renderer_gpu_allocation = [ + mm_config.mm_processing_device + ] + else: + device = self.parallel_config._renderer_gpu_allocation[ + self.parallel_config._api_process_rank] + mm_config.update_mm_processor_kwargs({"device": device}) + if not self.instance_id: self.instance_id = random_uuid()[:5] diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 4125dfd3db60..13e7d03794c7 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -212,7 +212,7 @@ class is dynamically inherited by the worker class. This is used to inject should only be set by API server scale-out. """ - _renderer_gpu_allocation: list[str] = field(default_factory=list) + _renderer_gpu_allocation: Optional[list[str]] = None """ The GPU allocated to the renderer of each API process. 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 087bd6419816..b9845e74f69c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -335,8 +335,8 @@ class EngineArgs: ParallelConfig.expert_placement_strategy _api_process_count: int = ParallelConfig._api_process_count _api_process_rank: int = ParallelConfig._api_process_rank - _renderer_gpu_allocation: list[ - str] = ParallelConfig._renderer_gpu_allocation + _renderer_gpu_allocation: Optional[ + list[str]] = ParallelConfig._renderer_gpu_allocation num_redundant_experts: int = EPLBConfig.num_redundant_experts eplb_window_size: int = EPLBConfig.window_size eplb_step_interval: int = EPLBConfig.step_interval diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index c777626e19e6..0176d1409183 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -560,10 +560,12 @@ def profile_run(self) -> None: return parallel_config = self.parallel_config - api_process_rank = parallel_config._api_process_rank gpu_allocation = parallel_config._renderer_gpu_allocation - device = gpu_allocation[api_process_rank] - mm_config.update_mm_processor_kwargs({"device": device}) + if not gpu_allocation: + return + + api_process_rank = parallel_config._api_process_rank + device = mm_config.mm_processing_device if mm_config.mm_processing_device != "cpu": # Peak memory usage (required for this profiling) From dd6520857c6df39659faf8d30626a85911f339e3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 18:05:44 +0000 Subject: [PATCH 106/130] Add code comment Signed-off-by: DarkLight1337 --- vllm/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 28a7d9100205..4f9782196588 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -654,7 +654,7 @@ def __post_init__(self): self.parallel_config._renderer_gpu_allocation = [ mm_config.mm_processing_device ] - 
else: + else: # GPU allocation is already set by API server scale-out device = self.parallel_config._renderer_gpu_allocation[ self.parallel_config._api_process_rank] mm_config.update_mm_processor_kwargs({"device": device}) From a508b06fecdd2b6dde06afdac2d89c6f453fa17c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 18:18:43 +0000 Subject: [PATCH 107/130] Move allocation to config Signed-off-by: DarkLight1337 --- vllm/config/__init__.py | 36 ++++++++++++++++++++++++----------- vllm/config/multimodal.py | 4 ++++ vllm/config/parallel.py | 2 +- vllm/entrypoints/cli/serve.py | 26 ------------------------- vllm/v1/engine/processor.py | 5 +++-- 5 files changed, 33 insertions(+), 40 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 4f9782196588..397100f78001 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -647,17 +647,31 @@ def __post_init__(self): "the VLLM_ALL2ALL_BACKEND environment variable to "\ "deepep_low_latency and install the DeepEP kerenls." 
- if (mm_config := self.model_config.multimodal_config): - if self.parallel_config._renderer_gpu_allocation is None: - assert self.parallel_config._api_process_count == 1 - assert self.parallel_config._api_process_rank == 0 - self.parallel_config._renderer_gpu_allocation = [ - mm_config.mm_processing_device - ] - else: # GPU allocation is already set by API server scale-out - device = self.parallel_config._renderer_gpu_allocation[ - self.parallel_config._api_process_rank] - mm_config.update_mm_processor_kwargs({"device": device}) + mm_config = self.model_config.multimodal_config + if mm_config and mm_config.mm_processing_device != "cpu": + api_process_count = self.parallel_config._api_process_count + api_process_rank = self.parallel_config._api_process_rank + local_gpu_count = (self.parallel_config.data_parallel_size_local * + self.parallel_config.world_size) + + if api_process_rank != -1: + from vllm.multimodal.utils import allocate_gpu_mm_processors + + device_count = current_platform.device_count() # type: ignore + + gpu_allocation = allocate_gpu_mm_processors( + mm_config.mm_processing_device, + api_process_count, + available_device_count=device_count, + engine_device_count=local_gpu_count, + ) + device = gpu_allocation[api_process_rank] + + logger.info("Multi-modal processor will be run on device %s", + device) + + self.parallel_config._renderer_gpu_allocation = gpu_allocation + mm_config.mm_processing_device = device if not self.instance_id: self.instance_id = random_uuid()[:5] diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index f76e34525eef..8e31edbe2b6e 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -84,6 +84,10 @@ def mm_processing_device(self) -> str: kwargs = self.mm_processor_kwargs or {} return str(kwargs.get("device", "cpu")) + @mm_processing_device.setter + def mm_processing_device(self, device: str) -> None: + self.update_mm_processor_kwargs({"device": device}) + def compute_hash(self) -> str: """ 
WARNING: Whenever a new field is added to this config, diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 13e7d03794c7..8f921635da1f 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -218,7 +218,7 @@ class is dynamically inherited by the worker class. This is used to inject Note: This is an internal config that is only valid for and - should only be set by API server scale-out. + should only be set internally. """ @property diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 12bca9f15877..bedb788c27b8 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -17,8 +17,6 @@ from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG, show_filtered_argument_or_group_from_help) from vllm.logger import init_logger -from vllm.multimodal.utils import allocate_gpu_mm_processors -from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, decorate_logs, get_tcp_uri, set_process_title) @@ -166,30 +164,6 @@ def run_multi_api_server(args: argparse.Namespace): raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used " "with api_server_count > 1") - mm_config = vllm_config.model_config.multimodal_config - if mm_config and mm_config.mm_processing_device != "cpu": - device_count = current_platform.device_count() # type: ignore - - gpu_allocation = allocate_gpu_mm_processors( - mm_config.mm_processing_device, - parallel_config._api_process_count, - available_device_count=device_count, - engine_device_count=parallel_config.world_size_across_dp, - ) - - for i, device in enumerate(gpu_allocation): - logger.info( - "Multi-modal processor for APIServer_%s will be run " - "on device %s", - i, - device, - ) - - # Note: `engine_args` is sent to API servers - # while vllm_config is sent to EngineCore - engine_args._renderer_gpu_allocation = gpu_allocation - parallel_config._renderer_gpu_allocation = 
gpu_allocation - executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 0176d1409183..781d3569674b 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -593,8 +593,9 @@ def profile_run(self) -> None: # Only check init memory if we are sure that the EngineCore is not # loading weights or running profiling on the same GPU new_device_index = torch.device(device).index or 0 - local_engine_count = parallel_config.data_parallel_size_local - if new_device_index < local_engine_count: + local_gpu_count = (parallel_config.data_parallel_size_local * + parallel_config.world_size) + if new_device_index < local_gpu_count: logger.warning( "Both EngineCore and multi-modal processor are using " "the same GPU (%s). This may result in inaccurate memory " From cf26387d2470299dbb68ba707e43cdbfc678c413 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 18:22:09 +0000 Subject: [PATCH 108/130] Remove device map Signed-off-by: DarkLight1337 --- vllm/multimodal/utils.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index e598581d9cf2..a9bc6b29f1a0 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -24,7 +24,6 @@ from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather) -from vllm.platforms import current_platform from .audio import AudioMediaIO from .base import MediaIO @@ -373,12 +372,7 @@ def allocate_gpu_mm_processors( (device_idx, ) = map(int, rest) processor_gpu_idxs = [device_idx] * mm_processor_count - device_map = current_platform.device_id_to_physical_device_id - - return [ - f"{device_type}:{device_map(gpu_idx)}" - for gpu_idx in processor_gpu_idxs - ] + return [f"{device_type}:{gpu_idx}" for gpu_idx in processor_gpu_idxs] def argsort_mm_positions( 
From 0c2c4a63e037b3f8e9951cce1be4f23aef4c8bed Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 18:23:57 +0000 Subject: [PATCH 109/130] Reduce diff Signed-off-by: DarkLight1337 --- vllm/entrypoints/cli/serve.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index bedb788c27b8..de47bf00932e 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -150,12 +150,6 @@ def run_multi_api_server(args: argparse.Namespace): usage_context = UsageContext.OPENAI_API_SERVER vllm_config = engine_args.create_engine_config(usage_context=usage_context) - parallel_config = vllm_config.parallel_config - dp_rank = parallel_config.data_parallel_rank - external_dp_lb = parallel_config.data_parallel_external_lb - hybrid_dp_lb = parallel_config.data_parallel_hybrid_lb - assert external_dp_lb or hybrid_dp_lb or dp_rank == 0 - if num_api_servers > 1: if not envs.VLLM_USE_V1: raise ValueError("api_server_count > 1 is only supported for V1") @@ -167,6 +161,12 @@ def run_multi_api_server(args: argparse.Namespace): executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats + parallel_config = vllm_config.parallel_config + dp_rank = parallel_config.data_parallel_rank + external_dp_lb = parallel_config.data_parallel_external_lb + hybrid_dp_lb = parallel_config.data_parallel_hybrid_lb + assert external_dp_lb or hybrid_dp_lb or dp_rank == 0 + api_server_manager: Optional[APIServerProcessManager] = None with launch_core_engines(vllm_config, executor_class, log_stats, From 3426a20b6bb440ff4517ab007fced0afc48aeea0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 18:26:23 +0000 Subject: [PATCH 110/130] Remove from init Signed-off-by: DarkLight1337 --- vllm/engine/arg_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b9845e74f69c..7a4bb0d41d23 
100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -335,8 +335,6 @@ class EngineArgs: ParallelConfig.expert_placement_strategy _api_process_count: int = ParallelConfig._api_process_count _api_process_rank: int = ParallelConfig._api_process_rank - _renderer_gpu_allocation: Optional[ - list[str]] = ParallelConfig._renderer_gpu_allocation num_redundant_experts: int = EPLBConfig.num_redundant_experts eplb_window_size: int = EPLBConfig.window_size eplb_step_interval: int = EPLBConfig.step_interval @@ -1375,7 +1373,6 @@ def create_engine_config( decode_context_parallel_size=self.decode_context_parallel_size, _api_process_count=self._api_process_count, _api_process_rank=self._api_process_rank, - _renderer_gpu_allocation=self._renderer_gpu_allocation, ) speculative_config = self.create_speculative_config( From 1a5dd58846b788fcf77dcecf09869f1e0767b78c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 18:29:33 +0000 Subject: [PATCH 111/130] Clean Signed-off-by: DarkLight1337 --- vllm/v1/engine/processor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 781d3569674b..5200cbb322d5 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -564,10 +564,8 @@ def profile_run(self) -> None: if not gpu_allocation: return - api_process_rank = parallel_config._api_process_rank device = mm_config.mm_processing_device - - if mm_config.mm_processing_device != "cpu": + if device != "cpu": # Peak memory usage (required for this profiling) # is only tracked for CUDA if not current_platform.is_cuda_alike(): @@ -578,6 +576,7 @@ def profile_run(self) -> None: # device. # Compared to running profiling on every Processor in parallel, # this avoids non-deterministic peak memory usage calculation. 
+ api_process_rank = parallel_config._api_process_rank if api_process_rank != gpu_allocation.index(device): return @@ -625,7 +624,8 @@ def profile_run(self) -> None: if memory_usage > diff.before_profile.free_memory: raise ValueError(f"Not enough memory in {device} " f"for multi-modal processor. " - f"Try reducing `api_server_count`.") + f"Try reducing `api_server_count` or " + f"revert to CPU processing.") def clear_cache(self) -> None: self.input_preprocessor.clear_cache() From e4a7e270b2236ce02489dcd6427ac677d69040dc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 20 Sep 2025 01:21:48 +0000 Subject: [PATCH 112/130] Guard model config Signed-off-by: DarkLight1337 --- vllm/config/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 397100f78001..cdf4644a07c9 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -647,7 +647,8 @@ def __post_init__(self): "the VLLM_ALL2ALL_BACKEND environment variable to "\ "deepep_low_latency and install the DeepEP kerenls." 
- mm_config = self.model_config.multimodal_config + mm_config = (self.model_config.multimodal_config + if self.model_config else None) if mm_config and mm_config.mm_processing_device != "cpu": api_process_count = self.parallel_config._api_process_count api_process_rank = self.parallel_config._api_process_rank From fe159f673f7602700b30773ee8725be706c3913d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 22 Sep 2025 10:04:25 +0000 Subject: [PATCH 113/130] Help debug Signed-off-by: DarkLight1337 --- vllm/inputs/registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 73ad839951e1..307e261e7ab0 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -197,6 +197,7 @@ def call_hf_processor( except Exception as exc: msg = (f"Failed to apply {type(hf_processor).__name__} " f"on data={data} with kwargs={allowed_kwargs}") + logger.exception(exc) raise ValueError(msg) from exc From 5629f00a8d8881c9c2afed9645a3d053fbda8c3d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 23 Sep 2025 08:09:00 +0000 Subject: [PATCH 114/130] Remove debug Signed-off-by: DarkLight1337 --- vllm/inputs/registry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 307e261e7ab0..73ad839951e1 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -197,7 +197,6 @@ def call_hf_processor( except Exception as exc: msg = (f"Failed to apply {type(hf_processor).__name__} " f"on data={data} with kwargs={allowed_kwargs}") - logger.exception(exc) raise ValueError(msg) from exc From 033361e3e39e7a9c482b6de074226c164375ed0d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 6 Oct 2025 03:10:26 +0000 Subject: [PATCH 115/130] ruff Signed-off-by: DarkLight1337 --- .pre-commit-config.yaml | 16 +- pyproject.toml | 78 +- tests/entrypoints/llm/test_chat.py | 131 ++- tests/entrypoints/openai/test_audio.py | 196 ++--- tests/entrypoints/openai/test_vision.py | 234 
+++--- tests/multimodal/test_utils.py | 88 +- .../test_processor_multi_modal_uuids.py | 114 ++- vllm/config/__init__.py | 511 +++++++----- vllm/config/multimodal.py | 6 +- vllm/config/parallel.py | 148 ++-- vllm/multimodal/processing.py | 333 ++++---- vllm/multimodal/utils.py | 120 +-- vllm/utils/__init__.py | 758 ++++++++++-------- vllm/v1/engine/processor.py | 194 +++-- vllm/v1/worker/gpu_worker.py | 376 +++++---- vllm/v1/worker/utils.py | 50 +- 16 files changed, 1822 insertions(+), 1531 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8ca414ee4269..95a3866e6bb8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,28 +6,16 @@ default_stages: - manual # Run in CI exclude: 'vllm/third_party/.*' repos: -- repo: https://github.com/google/yapf - rev: v0.43.0 - hooks: - - id: yapf - args: [--in-place, --verbose] - # Keep the same list from yapfignore here to avoid yapf failing without any inputs - exclude: '(.buildkite|benchmarks|build|examples)/.*' - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.7 + rev: v0.13.3 hooks: - - id: ruff + - id: ruff-check args: [--output-format, github, --fix] - id: ruff-format - files: ^(.buildkite|benchmarks|examples)/.* - repo: https://github.com/crate-ci/typos rev: v1.35.5 hooks: - id: typos -- repo: https://github.com/PyCQA/isort - rev: 6.0.1 - hooks: - - id: isort - repo: https://github.com/pre-commit/mirrors-clang-format rev: v20.1.3 hooks: diff --git a/pyproject.toml b/pyproject.toml index 88c5c4067f5a..b3cae3d00cb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,27 +52,56 @@ lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:regi where = ["."] include = ["vllm*"] -[tool.yapfignore] -ignore_patterns = [ - ".buildkite/**", - "benchmarks/**", - "build/**", - "examples/**", -] - -[tool.ruff] -# Allow lines to be as long as 80. 
-line-length = 80 - [tool.ruff.lint.per-file-ignores] "vllm/third_party/**" = ["ALL"] "vllm/version.py" = ["F401"] "vllm/_version.py" = ["ALL"] -# Python 3.8 typing - skip V0 code -"vllm/attention/**/*.py" = ["UP006", "UP035"] -"vllm/engine/**/*.py" = ["UP006", "UP035"] -"vllm/executor/**/*.py" = ["UP006", "UP035"] -"vllm/worker/**/*.py" = ["UP006", "UP035"] +# TEMPORARY! These ignores will be fixed forward +## Line length violations +"csrc/cutlass_extensions/vllm_cutlass_library_extension.py" = ["E501"] +"tests/compile/piecewise/test_simple.py" = ["E501"] +"tests/compile/piecewise/test_toy_llama.py" = ["E501", "B023"] +"tests/entrypoints/conftest.py" = ["E501"] +"tests/entrypoints/openai/test_audio.py" = ["E501"] +"tests/entrypoints/openai/test_chat.py" = ["E501"] +"tests/entrypoints/openai/test_chat_template.py" = ["E501"] +"tests/entrypoints/openai/test_chat_with_tool_reasoning.py" = ["E501"] +"tests/entrypoints/openai/test_completion_with_function_calling.py" = ["E501"] +"tests/entrypoints/openai/test_video.py" = ["E501"] +"tests/entrypoints/openai/test_vision.py" = ["E501"] +"tests/entrypoints/test_chat_utils.py" = ["E501"] +"tests/kernels/moe/modular_kernel_tools/common.py" = ["E501"] +"tests/models/language/generation/test_gemma.py" = ["E501"] +"tests/models/language/generation/test_mistral.py" = ["E501"] +"tests/models/multimodal/generation/test_ultravox.py" = ["E501"] +"tests/models/multimodal/generation/test_voxtral.py" = ["E501"] +"tests/models/multimodal/generation/vlm_utils/custom_inputs.py" = ["E501"] +"tests/tool_use/test_tool_choice_required.py" = ["E501"] +"tests/v1/attention/utils.py" = ["E501"] +"tests/v1/entrypoints/openai/responses/test_image.py" = ["E501"] +"tests/v1/kv_connector/nixl_integration/test_accuracy.py" = ["E501"] +"tests/v1/kv_connector/unit/test_offloading_connector.py" = ["E501"] +"tests/v1/logits_processors/test_custom_offline.py" = ["E501"] +"vllm/attention/ops/pallas_kv_cache_update.py" = ["E501"] 
+"vllm/compilation/collective_fusion.py" = ["E501"] +"vllm/compilation/wrapper.py" = ["E501"] +"vllm/config/vllm.py" = ["E501"] +"vllm/distributed/device_communicators/all2all.py" = ["E501"] +"vllm/entrypoints/openai/protocol.py" = ["E501"] +"vllm/lora/layers/vocal_parallel_embedding.py" = ["E501"] +"vllm/model_executor/model_loader/bitsandbytes_loader.py" = ["E501"] +"vllm/model_executor/models/bailing_moe.py" = ["E501"] +"vllm/model_executor/models/hyperclovax_vision.py" = ["E501"] +"vllm/model_executor/models/llama4_eagle.py" = ["E501"] +"vllm/model_executor/models/longcat_flash_mtp.py" = ["E501"] +"vllm/model_executor/models/phi4mm.py" = ["E501"] +"vllm/model_executor/models/qwen3_next.py" = ["E501"] +"vllm/model_executor/layers/quantization/ptpc_fp8.py" = ["E501"] +"vllm/v1/attention/backends/mla/common.py" = ["E501"] +"vllm/v1/engine/utils.py" = ["E501"] +"vllm/v1/utils.py" = ["E501"] +"vllm/v1/worker/gpu_model_runner.py" = ["E501"] +# End of temporary ignores [tool.ruff.lint] select = [ @@ -87,7 +116,7 @@ select = [ # flake8-simplify "SIM", # isort - # "I", + "I", # flake8-logging-format "G", ] @@ -104,21 +133,15 @@ ignore = [ "UP007", ] +[tool.ruff.format] +docstring-code-format = true + [tool.mypy] plugins = ['pydantic.mypy'] ignore_missing_imports = true check_untyped_defs = true follow_imports = "silent" -[tool.isort] -skip_glob = [ - ".buildkite/*", - "benchmarks/*", - "examples/*", -] -use_parentheses = true -skip_gitignore = true - [tool.pytest.ini_options] markers = [ "slow_test", @@ -126,6 +149,7 @@ markers = [ "core_model: enable this model test in each PR instead of only nightly", "hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)", "cpu_model: enable this model test in CPU tests", + "cpu_test: mark test as CPU-only test", "split: run this test as part of a split", "distributed: run this test only in distributed GPU tests", "skip_v1: do not run this test with v1", diff --git 
a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index fc9353b2b99e..e75e0e3d1fcb 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -9,17 +9,14 @@ from vllm.platforms import current_platform from ..openai.test_audio import TEST_AUDIO_URLS, dummy_messages_from_audio_url -from ..openai.test_vision import (TEST_IMAGE_ASSETS, - dummy_messages_from_image_url) +from ..openai.test_vision import TEST_IMAGE_ASSETS, dummy_messages_from_image_url @pytest.fixture(scope="function") def text_llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", - enforce_eager=True, - seed=0) + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, seed=0) yield weakref.proxy(llm) @@ -31,14 +28,8 @@ def text_llm(): def test_chat(text_llm): prompt1 = "Explain the concept of entropy." messages = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt1 - }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": prompt1}, ] outputs = text_llm.chat(messages) assert len(outputs) == 1 @@ -49,25 +40,13 @@ def test_multi_chat(text_llm): prompt2 = "Explain what among us is." 
conversation1 = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt1 - }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": prompt1}, ] conversation2 = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt2 - }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": prompt2}, ] messages = [conversation1, conversation2] @@ -97,26 +76,22 @@ def vision_llm(): cleanup_dist_env_and_memory() -@pytest.mark.parametrize("image_urls", - [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], - indirect=True) +@pytest.mark.parametrize( + "image_urls", [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], indirect=True +) def test_chat_multi_image(vision_llm, image_urls: list[str]): - messages = [{ - "role": - "user", - "content": [ - *({ - "type": "image_url", - "image_url": { - "url": image_url - } - } for image_url in image_urls), - { - "type": "text", - "text": "What's in this image?" - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + *( + {"type": "image_url", "image_url": {"url": image_url}} + for image_url in image_urls + ), + {"type": "text", "text": "What's in this image?"}, + ], + } + ] outputs = vision_llm.chat(messages) assert len(outputs) >= 0 @@ -127,14 +102,8 @@ def test_llm_chat_tokenization_no_double_bos(text_llm): Check we get a single BOS token for llama chat. """ messages = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello!" 
- }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello!"}, ] outputs = text_llm.chat(messages) assert len(outputs) == 1 @@ -170,14 +139,8 @@ def thinking_llm(): @pytest.mark.parametrize("enable_thinking", [True, False]) def test_chat_extra_kwargs(thinking_llm, enable_thinking): messages = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "What is 1+1?" - }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "What is 1+1?"}, ] outputs = thinking_llm.chat( @@ -198,17 +161,17 @@ def test_chat_extra_kwargs(thinking_llm, enable_thinking): assert think_id in prompt_token_ids -@pytest.mark.parametrize(("model_id", "modality", "mm_init_kwargs"), [ - ("Qwen/Qwen2.5-VL-3B-Instruct", "image", { - "use_fast": True - }), - ("Qwen/Qwen2-Audio-7B-Instruct", "audio", {}), -]) -@pytest.mark.parametrize("image_urls", - [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], - indirect=True) -def test_mm_processing_gpu(model_id, modality, mm_init_kwargs, - image_urls: list[str]): +@pytest.mark.parametrize( + ("model_id", "modality", "mm_init_kwargs"), + [ + ("Qwen/Qwen2.5-VL-3B-Instruct", "image", {"use_fast": True}), + ("Qwen/Qwen2-Audio-7B-Instruct", "audio", {}), + ], +) +@pytest.mark.parametrize( + "image_urls", [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], indirect=True +) +def test_mm_processing_gpu(model_id, modality, mm_init_kwargs, image_urls: list[str]): device = current_platform.device_name num_items = 2 @@ -233,15 +196,17 @@ def test_mm_processing_gpu(model_id, modality, mm_init_kwargs, assert len(outputs) == 1 -@pytest.mark.parametrize(("model_id", "modality", "mm_init_kwargs"), [ - ("Qwen/Qwen2.5-VL-3B-Instruct", "image", { - "use_fast": True - }), - ("Qwen/Qwen2-Audio-7B-Instruct", "audio", {}), -]) +@pytest.mark.parametrize( + ("model_id", "modality", "mm_init_kwargs"), + [ + ("Qwen/Qwen2.5-VL-3B-Instruct", "image", 
{"use_fast": True}), + ("Qwen/Qwen2-Audio-7B-Instruct", "audio", {}), + ], +) @pytest.mark.parametrize("image_urls", [[TEST_IMAGE_ASSETS[0]]], indirect=True) -def test_mm_processing_gpu_bad_device(model_id, modality, mm_init_kwargs, - image_urls: list[str]): +def test_mm_processing_gpu_bad_device( + model_id, modality, mm_init_kwargs, image_urls: list[str] +): device = current_platform.device_name if device == "cpu": pytest.skip("Not applicable to CPU") diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 97ca3f2af72f..05b135d8cf64 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -61,29 +61,26 @@ def dummy_messages_from_audio_url( if isinstance(audio_urls, str): audio_urls = [audio_urls] - return [{ - "role": - "user", - "content": [ - *({ - "type": "audio_url", - "audio_url": { - "url": audio_url - } - } for audio_url in audio_urls), - { - "type": "text", - "text": content_text - }, - ], - }] + return [ + { + "role": "user", + "content": [ + *( + {"type": "audio_url", "audio_url": {"url": audio_url}} + for audio_url in audio_urls + ), + {"type": "text", "text": content_text}, + ], + } + ] @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) -async def test_single_chat_session_audio(client: openai.AsyncOpenAI, - model_name: str, audio_url: str): +async def test_single_chat_session_audio( + client: openai.AsyncOpenAI, model_name: str, audio_url: str +): messages = dummy_messages_from_audio_url(audio_url) # test single completion @@ -93,13 +90,15 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, max_completion_tokens=10, logprobs=True, temperature=0.0, - top_logprobs=5) + top_logprobs=5, + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - 
completion_tokens=10, prompt_tokens=202, total_tokens=212) + completion_tokens=10, prompt_tokens=202, total_tokens=212 + ) message = choice.message message = chat_completion.choices[0].message @@ -121,41 +120,41 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) -async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI, - model_name: str, - audio_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "audio_url", - "audio_url": audio_url - }, - { - "type": "text", - "text": "What's happening in this audio?" - }, - ], - }] +async def test_error_on_invalid_audio_url_type( + client: openai.AsyncOpenAI, model_name: str, audio_url: str +): + messages = [ + { + "role": "user", + "content": [ + {"type": "audio_url", "audio_url": audio_url}, + {"type": "text", "text": "What's happening in this audio?"}, + ], + } + ] # audio_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - temperature=0.0) + _ = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def test_single_chat_session_audio_base64encoded( - client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: dict[str, str]): - + client: openai.AsyncOpenAI, + model_name: str, + audio_url: str, + base64_encoded_audio: dict[str, str], +): messages = dummy_messages_from_audio_url( - f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}") + f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" + ) # test single completion 
chat_completion = await client.chat.completions.create( @@ -164,13 +163,15 @@ async def test_single_chat_session_audio_base64encoded( max_completion_tokens=10, logprobs=True, temperature=0.0, - top_logprobs=5) + top_logprobs=5, + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=202, total_tokens=212) + completion_tokens=10, prompt_tokens=202, total_tokens=212 + ) message = choice.message message = chat_completion.choices[0].message @@ -194,25 +195,26 @@ async def test_single_chat_session_audio_base64encoded( @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def test_single_chat_session_input_audio( - client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: dict[str, str]): - messages = [{ - "role": - "user", - "content": [ - { - "type": "input_audio", - "input_audio": { - "data": base64_encoded_audio[audio_url], - "format": "wav" - } - }, - { - "type": "text", - "text": "What's happening in this audio?" 
- }, - ], - }] + client: openai.AsyncOpenAI, + model_name: str, + audio_url: str, + base64_encoded_audio: dict[str, str], +): + messages = [ + { + "role": "user", + "content": [ + { + "type": "input_audio", + "input_audio": { + "data": base64_encoded_audio[audio_url], + "format": "wav", + }, + }, + {"type": "text", "text": "What's happening in this audio?"}, + ], + } + ] # test single completion chat_completion = await client.chat.completions.create( @@ -220,13 +222,15 @@ async def test_single_chat_session_input_audio( messages=messages, max_completion_tokens=10, logprobs=True, - top_logprobs=5) + top_logprobs=5, + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=202, total_tokens=212) + completion_tokens=10, prompt_tokens=202, total_tokens=212 + ) message = choice.message message = chat_completion.choices[0].message @@ -248,8 +252,9 @@ async def test_single_chat_session_input_audio( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) -async def test_chat_streaming_audio(client: openai.AsyncOpenAI, - model_name: str, audio_url: str): +async def test_chat_streaming_audio( + client: openai.AsyncOpenAI, model_name: str, audio_url: str +): messages = dummy_messages_from_audio_url(audio_url) # test single completion @@ -290,27 +295,27 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) -async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, - model_name: str, audio_url: str, - base64_encoded_audio: dict[str, - str]): - messages = [{ - "role": - "user", - "content": [ - { - "type": "input_audio", - "input_audio": { - "data": base64_encoded_audio[audio_url], - "format": 
"wav" - } - }, - { - "type": "text", - "text": "What's happening in this audio?" - }, - ], - }] +async def test_chat_streaming_input_audio( + client: openai.AsyncOpenAI, + model_name: str, + audio_url: str, + base64_encoded_audio: dict[str, str], +): + messages = [ + { + "role": "user", + "content": [ + { + "type": "input_audio", + "input_audio": { + "data": base64_encoded_audio[audio_url], + "format": "wav", + }, + }, + {"type": "text", "text": "What's happening in this audio?"}, + ], + } + ] # test single completion chat_completion = await client.chat.completions.create( @@ -350,10 +355,11 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( - "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]]) -async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, - audio_urls: list[str]): - + "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]] +) +async def test_multi_audio_input( + client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str] +): messages = dummy_messages_from_audio_url(audio_urls) if len(audio_urls) > MAXIMUM_AUDIOS: diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 512829b1ac41..2a6c0920471b 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -72,8 +72,9 @@ async def client(server): @pytest.fixture(scope="session") def base64_encoded_image(local_asset_server) -> dict[str, str]: return { - image_asset: - encode_image_base64(local_asset_server.get_image_asset(image_asset)) + image_asset: encode_image_base64( + local_asset_server.get_image_asset(image_asset) + ) for image_asset in TEST_IMAGE_ASSETS } @@ -85,38 +86,37 @@ def dummy_messages_from_image_url( if isinstance(image_urls, str): image_urls = [image_urls] - return [{ - "role": - "user", - "content": [ - *({ - "type": 
"image_url", - "image_url": { - "url": image_url - } - } for image_url in image_urls), - { - "type": "text", - "text": content_text - }, - ], - }] + return [ + { + "role": "user", + "content": [ + *( + {"type": "image_url", "image_url": {"url": image_url}} + for image_url in image_urls + ), + {"type": "text", "text": content_text}, + ], + } + ] def get_hf_prompt_tokens(model_name, content, image_url): - processor = AutoProcessor.from_pretrained(model_name, - trust_remote_code=True, - num_crops=4) + processor = AutoProcessor.from_pretrained( + model_name, trust_remote_code=True, num_crops=4 + ) placeholder = "<|image_1|>\n" - messages = [{ - "role": "user", - "content": f"{placeholder}{content}", - }] + messages = [ + { + "role": "user", + "content": f"{placeholder}{content}", + } + ] images = [fetch_image(image_url)] prompt = processor.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True) + messages, tokenize=False, add_generation_prompt=True + ) inputs = processor(prompt, images, return_tensors="pt") return inputs.input_ids.shape[1] @@ -125,8 +125,9 @@ def get_hf_prompt_tokens(model_name, content, image_url): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) -async def test_single_chat_session_image(client: openai.AsyncOpenAI, - model_name: str, image_url: str): +async def test_single_chat_session_image( + client: openai.AsyncOpenAI, model_name: str, image_url: str +): content_text = "What's in this image?" 
messages = dummy_messages_from_image_url(image_url, content_text) @@ -138,17 +139,18 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, max_completion_tokens=max_completion_tokens, logprobs=True, temperature=0.0, - top_logprobs=5) + top_logprobs=5, + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" - hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, - image_url) + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url) assert chat_completion.usage == openai.types.CompletionUsage( completion_tokens=max_completion_tokens, prompt_tokens=hf_prompt_tokens, - total_tokens=hf_prompt_tokens + max_completion_tokens) + total_tokens=hf_prompt_tokens + max_completion_tokens, + ) message = choice.message message = chat_completion.choices[0].message @@ -170,39 +172,36 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) -async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, - model_name: str, - image_url: str): +async def test_error_on_invalid_image_url_type( + client: openai.AsyncOpenAI, model_name: str, image_url: str +): content_text = "What's in this image?" 
- messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": image_url - }, - { - "type": "text", - "text": content_text - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": image_url}, + {"type": "text", "text": content_text}, + ], + } + ] # image_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - temperature=0.0) + _ = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) -async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, - model_name: str, - image_url: str): +async def test_single_chat_session_image_beamsearch( + client: openai.AsyncOpenAI, model_name: str, image_url: str +): content_text = "What's in this image?" 
messages = dummy_messages_from_image_url(image_url, content_text) @@ -213,10 +212,13 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, max_completion_tokens=10, logprobs=True, top_logprobs=5, - extra_body=dict(use_beam_search=True)) + extra_body=dict(use_beam_search=True), + ) assert len(chat_completion.choices) == 2 - assert chat_completion.choices[ - 0].message.content != chat_completion.choices[1].message.content + assert ( + chat_completion.choices[0].message.content + != chat_completion.choices[1].message.content + ) @pytest.mark.asyncio @@ -224,9 +226,12 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, @pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image_base64encoded( - client: openai.AsyncOpenAI, model_name: str, raw_image_url: str, - image_url: str, base64_encoded_image: dict[str, str]): - + client: openai.AsyncOpenAI, + model_name: str, + raw_image_url: str, + image_url: str, + base64_encoded_image: dict[str, str], +): content_text = "What's in this image?" 
messages = dummy_messages_from_image_url( f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", @@ -241,17 +246,18 @@ async def test_single_chat_session_image_base64encoded( max_completion_tokens=max_completion_tokens, logprobs=True, temperature=0.0, - top_logprobs=5) + top_logprobs=5, + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" - hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, - image_url) + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url) assert chat_completion.usage == openai.types.CompletionUsage( completion_tokens=max_completion_tokens, prompt_tokens=hf_prompt_tokens, - total_tokens=hf_prompt_tokens + max_completion_tokens) + total_tokens=hf_prompt_tokens + max_completion_tokens, + ) message = choice.message message = chat_completion.choices[0].message @@ -275,14 +281,18 @@ async def test_single_chat_session_image_base64encoded( @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS)))) async def test_single_chat_session_image_base64encoded_beamsearch( - client: openai.AsyncOpenAI, model_name: str, image_idx: int, - base64_encoded_image: dict[str, str]): + client: openai.AsyncOpenAI, + model_name: str, + image_idx: int, + base64_encoded_image: dict[str, str], +): # NOTE: This test also validates that we pass MM data through beam search raw_image_url = TEST_IMAGE_ASSETS[image_idx] expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] messages = dummy_messages_from_image_url( - f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}") + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" + ) chat_completion = await client.chat.completions.create( model=model_name, @@ -290,7 +300,8 @@ async def test_single_chat_session_image_base64encoded_beamsearch( n=2, max_completion_tokens=10, temperature=0.0, - extra_body=dict(use_beam_search=True)) + 
extra_body=dict(use_beam_search=True), + ) assert len(chat_completion.choices) == 2 for actual, expected_str in zip(chat_completion.choices, expected_res): assert actual.message.content == expected_str @@ -299,8 +310,9 @@ async def test_single_chat_session_image_base64encoded_beamsearch( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) -async def test_chat_streaming_image(client: openai.AsyncOpenAI, - model_name: str, image_url: str): +async def test_chat_streaming_image( + client: openai.AsyncOpenAI, model_name: str, image_url: str +): messages = dummy_messages_from_image_url(image_url) # test single completion @@ -343,9 +355,11 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, @pytest.mark.parametrize( "image_urls", [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], - indirect=True) -async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, - image_urls: list[str]): + indirect=True, +) +async def test_multi_image_input( + client: openai.AsyncOpenAI, model_name: str, image_urls: list[str] +): messages = dummy_messages_from_image_url(image_urls) if len(image_urls) > MAXIMUM_IMAGES: @@ -382,7 +396,8 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, @pytest.mark.parametrize( "image_urls", [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], - indirect=True) + indirect=True, +) async def test_completions_with_image( client: openai.AsyncOpenAI, model_name: str, @@ -391,13 +406,9 @@ async def test_completions_with_image( for image_url in image_urls: chat_completion = await client.chat.completions.create( messages=[ + {"role": "system", "content": "You are a helpful assistant."}, { - "role": "system", - "content": "You are a helpful assistant." 
- }, - { - "role": - "user", + "role": "user", "content": [ { "type": "text", @@ -407,7 +418,7 @@ async def test_completions_with_image( "type": "image_url", "image_url": { "url": image_url, - } + }, }, ], }, @@ -424,7 +435,8 @@ async def test_completions_with_image( @pytest.mark.parametrize( "image_urls", [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], - indirect=True) + indirect=True, +) async def test_completions_with_image_with_uuid( client: openai.AsyncOpenAI, model_name: str, @@ -433,13 +445,9 @@ async def test_completions_with_image_with_uuid( for image_url in image_urls: chat_completion = await client.chat.completions.create( messages=[ + {"role": "system", "content": "You are a helpful assistant."}, { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": - "user", + "role": "user", "content": [ { "type": "text", @@ -450,7 +458,7 @@ async def test_completions_with_image_with_uuid( "image_url": { "url": image_url, }, - "uuid": image_url + "uuid": image_url, }, ], }, @@ -464,34 +472,25 @@ async def test_completions_with_image_with_uuid( # Second request, with empty image but the same uuid. chat_completion_with_empty_image = await client.chat.completions.create( messages=[ + {"role": "system", "content": "You are a helpful assistant."}, { - "role": "system", - "content": "You are a helpful assistant." 
- }, - { - "role": - "user", + "role": "user", "content": [ { "type": "text", "text": "Describe this image.", }, - { - "type": "image_url", - "image_url": {}, - "uuid": image_url - }, + {"type": "image_url", "image_url": {}, "uuid": image_url}, ], }, ], model=model_name, ) - assert chat_completion_with_empty_image.choices[ - 0].message.content is not None + assert chat_completion_with_empty_image.choices[0].message.content is not None assert isinstance( - chat_completion_with_empty_image.choices[0].message.content, str) - assert len( - chat_completion_with_empty_image.choices[0].message.content) > 0 + chat_completion_with_empty_image.choices[0].message.content, str + ) + assert len(chat_completion_with_empty_image.choices[0].message.content) > 0 @pytest.mark.asyncio @@ -503,13 +502,9 @@ async def test_completions_with_empty_image_with_uuid_without_cache_hit( with pytest.raises(openai.BadRequestError): _ = await client.chat.completions.create( messages=[ + {"role": "system", "content": "You are a helpful assistant."}, { - "role": "system", - "content": "You are a helpful assistant." 
- }, - { - "role": - "user", + "role": "user", "content": [ { "type": "text", @@ -518,7 +513,7 @@ async def test_completions_with_empty_image_with_uuid_without_cache_hit( { "type": "image_url", "image_url": {}, - "uuid": "uuid_not_previously_seen" + "uuid": "uuid_not_previously_seen", }, ], }, @@ -532,7 +527,8 @@ async def test_completions_with_empty_image_with_uuid_without_cache_hit( @pytest.mark.parametrize( "image_urls", [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], - indirect=True) + indirect=True, +) async def test_completions_with_image_with_incorrect_uuid_format( client: openai.AsyncOpenAI, model_name: str, @@ -541,13 +537,9 @@ async def test_completions_with_image_with_incorrect_uuid_format( for image_url in image_urls: chat_completion = await client.chat.completions.create( messages=[ + {"role": "system", "content": "You are a helpful assistant."}, { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": - "user", + "role": "user", "content": [ { "type": "text", diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index dddb8d57e7d9..070cd8072b1f 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -12,8 +12,11 @@ from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import PlaceholderRange -from vllm.multimodal.utils import (MediaConnector, allocate_gpu_mm_processors, - argsort_mm_positions) +from vllm.multimodal.utils import ( + MediaConnector, + allocate_gpu_mm_processors, + argsort_mm_positions, +) # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_ASSETS = [ @@ -31,7 +34,6 @@ @pytest.fixture(scope="module") def url_images(local_asset_server) -> dict[str, Image.Image]: - return { image_url: local_asset_server.get_image_asset(image_url) for image_url in TEST_IMAGE_ASSETS @@ -40,10 +42,10 @@ def url_images(local_asset_server) -> dict[str, Image.Image]: def get_supported_suffixes() -> tuple[str, 
...]: # We should at least test the file types mentioned in GPT-4 with Vision - OPENAI_SUPPORTED_SUFFIXES = ('.png', '.jpeg', '.jpg', '.webp', '.gif') + OPENAI_SUPPORTED_SUFFIXES = (".png", ".jpeg", ".jpg", ".webp", ".gif") # Additional file types that are supported by us - EXTRA_SUPPORTED_SUFFIXES = ('.bmp', '.tiff') + EXTRA_SUPPORTED_SUFFIXES = (".bmp", ".tiff") return OPENAI_SUPPORTED_SUFFIXES + EXTRA_SUPPORTED_SUFFIXES @@ -65,8 +67,9 @@ async def test_fetch_image_http(image_url: str): @pytest.mark.asyncio @pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS) @pytest.mark.parametrize("suffix", get_supported_suffixes()) -async def test_fetch_image_base64(url_images: dict[str, Image.Image], - raw_image_url: str, suffix: str): +async def test_fetch_image_base64( + url_images: dict[str, Image.Image], raw_image_url: str, suffix: str +): connector = MediaConnector() url_image = url_images[raw_image_url] @@ -76,14 +79,14 @@ async def test_fetch_image_base64(url_images: dict[str, Image.Image], try: mime_type = mimetypes.types_map[suffix] except KeyError: - pytest.skip('No MIME type') + pytest.skip("No MIME type") with NamedTemporaryFile(suffix=suffix) as f: try: url_image.save(f.name) except Exception as e: - if e.args[0] == 'cannot write mode RGBA as JPEG': - pytest.skip('Conversion not supported') + if e.args[0] == "cannot write mode RGBA as JPEG": + pytest.skip("Conversion not supported") raise @@ -109,30 +112,36 @@ async def test_fetch_image_local_files(image_url: str): local_connector = MediaConnector(allowed_local_media_path=temp_dir) origin_image = connector.fetch_image(image_url) - origin_image.save(os.path.join(temp_dir, os.path.basename(image_url)), - quality=100, - icc_profile=origin_image.info.get('icc_profile')) + origin_image.save( + os.path.join(temp_dir, os.path.basename(image_url)), + quality=100, + icc_profile=origin_image.info.get("icc_profile"), + ) image_async = await local_connector.fetch_image_async( - 
f"file://{temp_dir}/{os.path.basename(image_url)}") + f"file://{temp_dir}/{os.path.basename(image_url)}" + ) image_sync = local_connector.fetch_image( - f"file://{temp_dir}/{os.path.basename(image_url)}") + f"file://{temp_dir}/{os.path.basename(image_url)}" + ) # Check that the images are equal assert not ImageChops.difference(image_sync, image_async).getbbox() with pytest.raises(ValueError, match="must be a subpath"): await local_connector.fetch_image_async( - f"file://{temp_dir}/../{os.path.basename(image_url)}") + f"file://{temp_dir}/../{os.path.basename(image_url)}" + ) with pytest.raises(RuntimeError, match="Cannot load local files"): await connector.fetch_image_async( - f"file://{temp_dir}/../{os.path.basename(image_url)}") + f"file://{temp_dir}/../{os.path.basename(image_url)}" + ) with pytest.raises(ValueError, match="must be a subpath"): local_connector.fetch_image( - f"file://{temp_dir}/../{os.path.basename(image_url)}") + f"file://{temp_dir}/../{os.path.basename(image_url)}" + ) with pytest.raises(RuntimeError, match="Cannot load local files"): - connector.fetch_image( - f"file://{temp_dir}/../{os.path.basename(image_url)}") + connector.fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}") @pytest.mark.asyncio @@ -145,18 +154,19 @@ async def test_fetch_image_local_files_with_space_in_name(image_url: str): origin_image = connector.fetch_image(image_url) filename = "file name with space.jpg" - origin_image.save(os.path.join(temp_dir, filename), - quality=100, - icc_profile=origin_image.info.get('icc_profile')) + origin_image.save( + os.path.join(temp_dir, filename), + quality=100, + icc_profile=origin_image.info.get("icc_profile"), + ) try: image_async = await local_connector.fetch_image_async( - f"file://{temp_dir}/{filename}") - image_sync = local_connector.fetch_image( - f"file://{temp_dir}/{filename}") + f"file://{temp_dir}/{filename}" + ) + image_sync = local_connector.fetch_image(f"file://{temp_dir}/{filename}") except FileNotFoundError 
as e: - pytest.fail( - "Failed to fetch image with space in name: {}".format(e)) + pytest.fail("Failed to fetch image with space in name: {}".format(e)) # Check that the images are equal assert not ImageChops.difference(image_sync, image_async).getbbox() @@ -179,9 +189,12 @@ async def test_fetch_image_error_conversion(): @pytest.mark.parametrize("num_frames", [-1, 32, 1800]) async def test_fetch_video_http(video_url: str, num_frames: int): connector = MediaConnector( - media_io_kwargs={"video": { - "num_frames": num_frames, - }}) + media_io_kwargs={ + "video": { + "num_frames": num_frames, + } + } + ) video_sync, metadata_sync = connector.fetch_video(video_url) video_async, metadata_async = await connector.fetch_video_async(video_url) @@ -194,8 +207,11 @@ async def test_fetch_video_http(video_url: str, num_frames: int): @pytest.mark.parametrize("max_duration", [1, 60, 1800]) @pytest.mark.parametrize("requested_fps", [2, 24]) async def test_fetch_video_http_with_dynamic_loader( - video_url: str, max_duration: int, requested_fps: int, - monkeypatch: pytest.MonkeyPatch): + video_url: str, + max_duration: int, + requested_fps: int, + monkeypatch: pytest.MonkeyPatch, +): with monkeypatch.context() as m: m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic") connector = MediaConnector( @@ -204,11 +220,11 @@ async def test_fetch_video_http_with_dynamic_loader( "max_duration": max_duration, "requested_fps": requested_fps, } - }) + } + ) video_sync, metadata_sync = connector.fetch_video(video_url) - video_async, metadata_async = await connector.fetch_video_async( - video_url) + video_async, metadata_async = await connector.fetch_video_async(video_url) assert np.array_equal(video_sync, video_async) assert metadata_sync == metadata_async diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py index c45a2c4344a1..b54c44a26f53 100644 --- a/tests/v1/engine/test_processor_multi_modal_uuids.py +++ 
b/tests/v1/engine/test_processor_multi_modal_uuids.py @@ -16,35 +16,33 @@ # Mock processor for testing -def _mk_processor(monkeypatch, - *, - mm_cache_gb: float = 4.0, - enable_prefix_caching: bool = True) -> Processor: +def _mk_processor( + monkeypatch, *, mm_cache_gb: float = 4.0, enable_prefix_caching: bool = True +) -> Processor: """ Create a Processor instance with minimal configuration suitable for unit tests without accessing external resources. """ - monkeypatch.setattr(ModelConfig, - "try_get_generation_config", - lambda self: {}, - raising=True) - monkeypatch.setattr(ModelConfig, - "__post_init__", - lambda self, *args: None, - raising=True) - monkeypatch.setattr(ModelConfig, - "verify_with_parallel_config", - lambda self, parallel_config: None, - raising=True) - monkeypatch.setattr(processor_mod, - "processor_cache_from_config", - lambda vllm_config, mm_registry: None, - raising=True) - - monkeypatch.setattr(VllmConfig, - "__post_init__", - lambda self: None, - raising=True) + monkeypatch.setattr( + ModelConfig, "try_get_generation_config", lambda self: {}, raising=True + ) + monkeypatch.setattr( + ModelConfig, "__post_init__", lambda self, *args: None, raising=True + ) + monkeypatch.setattr( + ModelConfig, + "verify_with_parallel_config", + lambda self, parallel_config: None, + raising=True, + ) + monkeypatch.setattr( + processor_mod, + "processor_cache_from_config", + lambda vllm_config, mm_registry: None, + raising=True, + ) + + monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True) model_config = ModelConfig( skip_tokenizer_init=True, @@ -57,13 +55,11 @@ def _mk_processor(monkeypatch, # Minimal multimodal_config to satisfy references in # Processor.process_inputs. 
class _MockMMConfig: - def __init__(self, gb: float): self.mm_processor_cache_gb = gb self.mm_processing_device = "cpu" - model_config.multimodal_config = _MockMMConfig( - mm_cache_gb) # type: ignore[attr-defined] + model_config.multimodal_config = _MockMMConfig(mm_cache_gb) # type: ignore[attr-defined] vllm_config = VllmConfig( model_config=model_config, cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching), @@ -80,13 +76,9 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch): prompt = { "prompt": "USER: \nDescribe\nASSISTANT:", - "multi_modal_data": { - "image": [cherry_pil_image, stop_pil_image] - }, + "multi_modal_data": {"image": [cherry_pil_image, stop_pil_image]}, # Mismatch: 2 items but only 1 uuid provided - "multi_modal_uuids": { - "image": ["hash_cherry"] - }, + "multi_modal_uuids": {"image": ["hash_cherry"]}, } with pytest.raises(ValueError, match="must have same length as data"): @@ -105,16 +97,13 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch): # Two modalities provided in data "multi_modal_data": { "image": [cherry_pil_image], - "video": [baby_reading_np_ndarrays] + "video": [baby_reading_np_ndarrays], }, # Only image uuids provided; video missing should raise - "multi_modal_uuids": { - "image": ["hash_cherry"] - }, + "multi_modal_uuids": {"image": ["hash_cherry"]}, } - with pytest.raises(ValueError, - match="must be provided if multi_modal_data"): + with pytest.raises(ValueError, match="must be provided if multi_modal_data"): processor.process_inputs( request_id="req-2", prompt=prompt, # type: ignore[arg-type] @@ -131,28 +120,28 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch): ], ) def test_multi_modal_uuids_accepts_none_and_passes_through( - monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool): - processor = _mk_processor(monkeypatch, - mm_cache_gb=mm_cache_gb, - enable_prefix_caching=enable_prefix_caching) + monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool +): 
+ processor = _mk_processor( + monkeypatch, + mm_cache_gb=mm_cache_gb, + enable_prefix_caching=enable_prefix_caching, + ) # Capture the overrides passed to InputPreprocessor.preprocess captured: dict[str, object] = {} - def fake_preprocess(prompt, - *, - tokenization_kwargs=None, - lora_request=None, - mm_uuids=None): + def fake_preprocess( + prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None + ): captured["mm_uuids"] = mm_uuids # Minimal processed inputs for decoder-only flow return {"type": "token", "prompt_token_ids": [1]} # Monkeypatch only the bound preprocess method on this instance - monkeypatch.setattr(processor.input_preprocessor, - "preprocess", - fake_preprocess, - raising=True) + monkeypatch.setattr( + processor.input_preprocessor, "preprocess", fake_preprocess, raising=True + ) # Use a consistent two-image scenario across all configurations mm_uuids = {"image": [None, "hash_stop"], "video": None} @@ -177,24 +166,19 @@ def fake_preprocess(prompt, def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): # When both processor cache is 0 and prefix caching disabled, the # processor builds overrides from request id instead of using user UUIDs. 
- processor = _mk_processor(monkeypatch, - mm_cache_gb=0.0, - enable_prefix_caching=False) + processor = _mk_processor(monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False) captured: dict[str, object] = {} - def fake_preprocess(prompt, - *, - tokenization_kwargs=None, - lora_request=None, - mm_uuids=None): + def fake_preprocess( + prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None + ): captured["mm_uuids"] = mm_uuids return {"type": "token", "prompt_token_ids": [1]} - monkeypatch.setattr(processor.input_preprocessor, - "preprocess", - fake_preprocess, - raising=True) + monkeypatch.setattr( + processor.input_preprocessor, "preprocess", fake_preprocess, raising=True + ) request_id = "req-42" mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": "hash_video"} diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 91a2ac3aaac8..9f6821bbcdf1 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -12,8 +12,7 @@ from contextlib import contextmanager from dataclasses import field, fields, is_dataclass, replace from functools import cached_property, lru_cache -from typing import (TYPE_CHECKING, Any, Literal, Optional, Protocol, TypeVar, - Union, cast) +from typing import TYPE_CHECKING, Any, Literal, Optional, Protocol, TypeVar, Union, cast import regex as re import torch @@ -23,23 +22,38 @@ import vllm.envs as envs from vllm import version -from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType, - PrefixCachingHashAlgo) -from vllm.config.compilation import (CompilationConfig, CompilationLevel, - CUDAGraphMode, PassConfig) +from vllm.config.cache import ( + BlockSize, + CacheConfig, + CacheDType, + MambaDType, + PrefixCachingHashAlgo, +) +from vllm.config.compilation import ( + CompilationConfig, + CompilationLevel, + CUDAGraphMode, + PassConfig, +) from vllm.config.kv_events import KVEventsConfig from vllm.config.kv_transfer import KVTransferConfig from vllm.config.load import LoadConfig from 
vllm.config.lora import LoRAConfig -from vllm.config.model import (ConvertOption, HfOverrides, LogprobsMode, - ModelConfig, ModelDType, ModelImpl, - RunnerOption, TaskOption, TokenizerMode, - iter_architecture_defaults, - try_match_architecture_defaults) -from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, - MultiModalConfig) -from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, - ParallelConfig) +from vllm.config.model import ( + ConvertOption, + HfOverrides, + LogprobsMode, + ModelConfig, + ModelDType, + ModelImpl, + RunnerOption, + TaskOption, + TokenizerMode, + iter_architecture_defaults, + try_match_architecture_defaults, +) +from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig +from vllm.config.parallel import DistributedExecutorBackend, EPLBConfig, ParallelConfig from vllm.config.pooler import PoolerConfig from vllm.config.scheduler import RunnerType, SchedulerConfig, SchedulerPolicy from vllm.config.speculative import SpeculativeConfig @@ -54,8 +68,7 @@ from _typeshed import DataclassInstance from transformers.configuration_utils import PretrainedConfig - from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) + from vllm.model_executor.layers.quantization.base_config import QuantizationConfig else: DataclassInstance = Any PretrainedConfig = Any @@ -70,15 +83,11 @@ @runtime_checkable class SupportsHash(Protocol): - - def compute_hash(self) -> str: - ... + def compute_hash(self) -> str: ... class SupportsMetricsInfo(Protocol): - - def metrics_info(self) -> dict[str, str]: - ... + def metrics_info(self) -> dict[str, str]: ... Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"] @@ -115,20 +124,21 @@ def compute_hash(self) -> str: # the device/platform information will be summarized # by torch/vllm automatically. 
factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str def __post_init__(self): if self.device == "auto": # Automated device type detection from vllm.platforms import current_platform + self.device_type = current_platform.device_type if not self.device_type: raise RuntimeError( "Failed to infer device type, please set " "the environment variable `VLLM_LOGGING_LEVEL=DEBUG` " - "to turn on verbose logging to help debug the issue.") + "to turn on verbose logging to help debug the issue." + ) else: # Device type is assigned explicitly if isinstance(self.device, str): @@ -165,8 +175,7 @@ def show_hidden_metrics(self) -> bool: """Check if the hidden metrics should be shown.""" if self.show_hidden_metrics_for_version is None: return False - return version._prev_minor_version_was( - self.show_hidden_metrics_for_version) + return version._prev_minor_version_was(self.show_hidden_metrics_for_version) otlp_traces_endpoint: Optional[str] = None """Target URL to which OpenTelemetry traces will be sent.""" @@ -183,16 +192,18 @@ def show_hidden_metrics(self) -> bool: @cached_property def collect_model_forward_time(self) -> bool: """Whether to collect model forward time for the request.""" - return (self.collect_detailed_traces is not None - and ("model" in self.collect_detailed_traces - or "all" in self.collect_detailed_traces)) + return self.collect_detailed_traces is not None and ( + "model" in self.collect_detailed_traces + or "all" in self.collect_detailed_traces + ) @cached_property def collect_model_execute_time(self) -> bool: """Whether to collect model execute time for the request.""" - return (self.collect_detailed_traces is not None - and ("worker" in self.collect_detailed_traces - or "all" in self.collect_detailed_traces)) + return self.collect_detailed_traces is not None and ( + "worker" in self.collect_detailed_traces 
+ or "all" in self.collect_detailed_traces + ) def compute_hash(self) -> str: """ @@ -209,28 +220,31 @@ def compute_hash(self) -> str: # no factors to consider. # this config will not affect the computation graph. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str def __post_init__(self): - if (self.collect_detailed_traces is not None - and len(self.collect_detailed_traces) == 1 - and "," in self.collect_detailed_traces[0]): + if ( + self.collect_detailed_traces is not None + and len(self.collect_detailed_traces) == 1 + and "," in self.collect_detailed_traces[0] + ): self._parse_collect_detailed_traces() from vllm.tracing import is_otel_available, otel_import_error_traceback + if not is_otel_available() and self.otlp_traces_endpoint is not None: raise ValueError( "OpenTelemetry is not available. Unable to configure " "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are " - f"installed. Original error:\n{otel_import_error_traceback}") + f"installed. 
Original error:\n{otel_import_error_traceback}" + ) def _parse_collect_detailed_traces(self): assert isinstance(self.collect_detailed_traces, list) self.collect_detailed_traces = cast( - list[DetailedTraceModules], - self.collect_detailed_traces[0].split(",")) + list[DetailedTraceModules], self.collect_detailed_traces[0].split(",") + ) @config @@ -259,14 +273,14 @@ class VllmConfig: speculative_config: Optional[SpeculativeConfig] = None """Speculative decoding configuration.""" structured_outputs_config: StructuredOutputsConfig = field( - default_factory=StructuredOutputsConfig) + default_factory=StructuredOutputsConfig + ) """Structured outputs configuration.""" observability_config: Optional[ObservabilityConfig] = None """Observability configuration.""" quant_config: Optional[QuantizationConfig] = None """Quantization configuration.""" - compilation_config: CompilationConfig = field( - default_factory=CompilationConfig) + compilation_config: CompilationConfig = field(default_factory=CompilationConfig) """`torch.compile` and cudagraph capture configuration for the model. As a shorthand, `-O` can be used to directly specify the compilation @@ -312,6 +326,7 @@ def compute_hash(self) -> str: # summarize vllm config vllm_factors: list[Any] = [] from vllm import __version__ + vllm_factors.append(__version__) vllm_factors.append(envs.VLLM_USE_V1) if self.model_config: @@ -343,8 +358,7 @@ def compute_hash(self) -> str: # LoRA creates static buffers based on max_num_batched_tokens. # The tensor sizes and strides get captured in the torch.compile # graph explicitly. 
- vllm_factors.append( - str(self.scheduler_config.max_num_batched_tokens)) + vllm_factors.append(str(self.scheduler_config.max_num_batched_tokens)) else: vllm_factors.append("None") if self.speculative_config: @@ -382,8 +396,9 @@ def compute_hash(self) -> str: vllm_factors.append("None") factors.append(vllm_factors) - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest()[:10] + hash_str = hashlib.md5( + str(factors).encode(), usedforsecurity=False + ).hexdigest()[:10] return hash_str def pad_for_cudagraph(self, batch_size: int) -> int: @@ -395,13 +410,14 @@ def pad_for_cudagraph(self, batch_size: int) -> int: @staticmethod def _get_quantization_config( - model_config: ModelConfig, - load_config: LoadConfig) -> Optional[QuantizationConfig]: + model_config: ModelConfig, load_config: LoadConfig + ) -> Optional[QuantizationConfig]: """Get the quantization config.""" from vllm.platforms import current_platform + if model_config.quantization is not None: - from vllm.model_executor.model_loader.weight_utils import ( - get_quant_config) + from vllm.model_executor.model_loader.weight_utils import get_quant_config + quant_config = get_quant_config(model_config, load_config) capability_tuple = current_platform.get_device_capability() @@ -412,26 +428,29 @@ def _get_quantization_config( f"The quantization method {model_config.quantization} " "is not supported for the current GPU. Minimum " f"capability: {quant_config.get_min_capability()}. " - f"Current capability: {capability}.") + f"Current capability: {capability}." + ) supported_dtypes = quant_config.get_supported_act_dtypes() if model_config.dtype not in supported_dtypes: raise ValueError( f"{model_config.dtype} is not supported for quantization " f"method {model_config.quantization}. 
Supported dtypes: " - f"{supported_dtypes}") + f"{supported_dtypes}" + ) return quant_config return None @staticmethod def get_quantization_config( - model_config: ModelConfig, - load_config: LoadConfig) -> Optional[QuantizationConfig]: + model_config: ModelConfig, load_config: LoadConfig + ) -> Optional[QuantizationConfig]: import copy # For some reason, the _ version of this modifies the model_config # object, so using deepcopy to avoid this problem. - return VllmConfig._get_quantization_config(copy.deepcopy(model_config), - load_config) + return VllmConfig._get_quantization_config( + copy.deepcopy(model_config), load_config + ) def with_hf_config( self, @@ -448,15 +467,13 @@ def with_hf_config( return replace(self, model_config=model_config) def __post_init__(self): - """Verify configs are valid & consistent with each other. - """ + """Verify configs are valid & consistent with each other.""" self.try_verify_and_update_config() if self.model_config is not None: self.model_config.verify_with_parallel_config(self.parallel_config) - self.model_config.verify_dual_chunk_attention_config( - self.load_config) + self.model_config.verify_dual_chunk_attention_config(self.load_config) self.cache_config.verify_with_parallel_config(self.parallel_config) @@ -466,29 +483,35 @@ def __post_init__(self): if self.quant_config is None and self.model_config is not None: self.quant_config = VllmConfig._get_quantization_config( - self.model_config, self.load_config) + self.model_config, self.load_config + ) from vllm.platforms import current_platform - if self.model_config is not None and \ - self.scheduler_config.chunked_prefill_enabled and \ - self.model_config.dtype == torch.float32 and \ - current_platform.get_device_capability() == (7, 5): + + if ( + self.model_config is not None + and self.scheduler_config.chunked_prefill_enabled + and self.model_config.dtype == torch.float32 + and current_platform.get_device_capability() == (7, 5) + ): logger.warning_once( "Turing devices 
tensor cores do not support float32 matmul. " "To workaround this limitation, vLLM will set 'ieee' input " - "precision for chunked prefill triton kernels.") + "precision for chunked prefill triton kernels." + ) # If the user does not explicitly set a compilation level, then # we use the default level. The default level depends on other # settings (see the below code). if self.compilation_config.level is None: if envs.VLLM_USE_V1: - if (self.model_config is not None - and not self.model_config.enforce_eager): + if ( + self.model_config is not None + and not self.model_config.enforce_eager + ): self.compilation_config.level = CompilationLevel.PIECEWISE else: - self.compilation_config.level = \ - CompilationLevel.NO_COMPILATION + self.compilation_config.level = CompilationLevel.NO_COMPILATION else: # NB: Passing both --enforce-eager and a compilation level @@ -498,8 +521,7 @@ def __post_init__(self): # async tp is built on top of sequence parallelism # and requires it to be enabled. if self.compilation_config.pass_config.enable_async_tp: - self.compilation_config.pass_config.enable_sequence_parallelism = \ - True + self.compilation_config.pass_config.enable_sequence_parallelism = True if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") @@ -507,23 +529,26 @@ def __post_init__(self): # if cudagraph_mode is not explicitly set by users, set default # value if self.compilation_config.cudagraph_mode is None: - if envs.VLLM_USE_V1 and self.compilation_config.level \ - == CompilationLevel.PIECEWISE: + if ( + envs.VLLM_USE_V1 + and self.compilation_config.level == CompilationLevel.PIECEWISE + ): # default to full and piecewise for most models - self.compilation_config.cudagraph_mode = \ + self.compilation_config.cudagraph_mode = ( CUDAGraphMode.FULL_AND_PIECEWISE + ) # pooling model does not support full cudagraphs - if self.model_config is not None and \ - self.model_config.pooler_config is not None: - 
self.compilation_config.cudagraph_mode = \ - CUDAGraphMode.PIECEWISE + if ( + self.model_config is not None + and self.model_config.pooler_config is not None + ): + self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE else: self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE # disable cudagraph when enforce eager execution - if self.model_config is not None and \ - self.model_config.enforce_eager: + if self.model_config is not None and self.model_config.enforce_eager: logger.info("Cudagraph is disabled under eager mode") self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE elif envs.VLLM_USE_V1: @@ -533,38 +558,49 @@ def __post_init__(self): else: self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - if self.cache_config.cpu_offload_gb > 0 and \ - self.compilation_config.level != CompilationLevel.NO_COMPILATION \ - and not envs.VLLM_USE_V1: + if ( + self.cache_config.cpu_offload_gb > 0 + and self.compilation_config.level != CompilationLevel.NO_COMPILATION + and not envs.VLLM_USE_V1 + ): logger.warning( "CPU offload is not supported with `torch.compile` in v0 yet." - " Disabling `torch.compile`.") + " Disabling `torch.compile`." + ) self.compilation_config.level = CompilationLevel.NO_COMPILATION if self.cache_config.kv_sharing_fast_prefill: if not envs.VLLM_USE_V1: raise NotImplementedError( "Fast prefill optimization for KV sharing is not supported " - "in V0 currently.") + "in V0 currently." + ) - if self.speculative_config is not None and \ - self.speculative_config.use_eagle(): + if ( + self.speculative_config is not None + and self.speculative_config.use_eagle() + ): raise NotImplementedError( "Fast prefill optimization for KV sharing is not " "compatible with EAGLE as EAGLE requires correct logits " "for all tokens while fast prefill gives incorrect logits " - "for prompt tokens.") + "for prompt tokens." 
+ ) logger.warning_once( "--kv-sharing-fast-prefill requires changes on model side for " - "correctness and to realize prefill savings. ") - - if ((not envs.VLLM_USE_V1) and self.lora_config is not None - and self.compilation_config.level - != CompilationLevel.NO_COMPILATION): + "correctness and to realize prefill savings. " + ) + + if ( + (not envs.VLLM_USE_V1) + and self.lora_config is not None + and self.compilation_config.level != CompilationLevel.NO_COMPILATION + ): logger.warning( "LoRA for V0 is not supported with `torch.compile` yet. " - "Disabling `torch.compile`.") + "Disabling `torch.compile`." + ) self.compilation_config.level = CompilationLevel.NO_COMPILATION disable_chunked_prefill_reasons: list[str] = [] @@ -574,32 +610,38 @@ def __post_init__(self): pooling_type = self.model_config.pooler_config.pooling_type if pooling_type is None or pooling_type.lower() != "last": disable_chunked_prefill_reasons.append( - "Only \"last\" pooling supports chunked " - "prefill and prefix caching; disabling both.") + 'Only "last" pooling supports chunked ' + "prefill and prefix caching; disabling both." + ) if not getattr(self.model_config.hf_config, "is_causal", True): disable_chunked_prefill_reasons.append( "Only models using causal attention supports chunked " - "prefill and prefix caching; disabling both.") + "prefill and prefix caching; disabling both." 
+ ) elif self.model_config.is_encoder_decoder: - self.scheduler_config.max_num_encoder_input_tokens = \ + self.scheduler_config.max_num_encoder_input_tokens = ( MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) + ) logger.debug( "Encoder-decoder model detected: setting " "`max_num_encoder_input_tokens` to encoder length (%s)", - self.scheduler_config.max_num_encoder_input_tokens) + self.scheduler_config.max_num_encoder_input_tokens, + ) self.scheduler_config.disable_chunked_mm_input = True disable_chunked_prefill_reasons.append( "Encoder-decoder models do not support chunked prefill nor" - " prefix caching; disabling both.") - if (self.model_config.architecture - == "WhisperForConditionalGeneration" - and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") - != "spawn"): + " prefix caching; disabling both." + ) + if ( + self.model_config.architecture == "WhisperForConditionalGeneration" + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" + ): logger.warning( "Whisper is known to have issues with " "forked workers. If startup is hanging, " "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " - "to 'spawn'.") + "to 'spawn'." + ) if disable_chunked_prefill_reasons: for reason in disable_chunked_prefill_reasons: @@ -610,56 +652,67 @@ def __post_init__(self): if self.cache_config is not None: self.cache_config.enable_prefix_caching = False - if (self.kv_events_config is not None - and self.kv_events_config.enable_kv_cache_events - and not self.cache_config.enable_prefix_caching): + if ( + self.kv_events_config is not None + and self.kv_events_config.enable_kv_cache_events + and not self.cache_config.enable_prefix_caching + ): logger.warning( "KV cache events are on, but prefix caching is not enabled." 
- "Use --enable-prefix-caching to enable.") - if (self.kv_events_config is not None - and self.kv_events_config.publisher != "null" - and not self.kv_events_config.enable_kv_cache_events): - logger.warning("KV cache events are disabled," - "but the scheduler is configured to publish them." - "Modify KVEventsConfig.enable_kv_cache_events" - "to True to enable.") + "Use --enable-prefix-caching to enable." + ) + if ( + self.kv_events_config is not None + and self.kv_events_config.publisher != "null" + and not self.kv_events_config.enable_kv_cache_events + ): + logger.warning( + "KV cache events are disabled," + "but the scheduler is configured to publish them." + "Modify KVEventsConfig.enable_kv_cache_events" + "to True to enable." + ) current_platform.check_and_update_config(self) # final check of cudagraph mode after platform-specific update if envs.VLLM_USE_V1 and current_platform.is_cuda_alike(): - if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \ - and self.model_config is not None and \ - not self.model_config.disable_cascade_attn: - logger.info("CUDAGraphMode.FULL is not supported with " - "cascade attention currently. Disabling cascade" - "attention.") + if ( + self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL + and self.model_config is not None + and not self.model_config.disable_cascade_attn + ): + logger.info( + "CUDAGraphMode.FULL is not supported with " + "cascade attention currently. Disabling cascade" + "attention." 
+ ) self.model_config.disable_cascade_attn = True - if self.compilation_config.cudagraph_mode\ - .requires_piecewise_compilation(): - assert self.compilation_config.level == \ - CompilationLevel.PIECEWISE, \ - "Compilation level should be CompilationLevel.PIECEWISE "\ - "when cudagraph_mode piecewise cudagraphs is used, "\ + if self.compilation_config.cudagraph_mode.requires_piecewise_compilation(): + assert self.compilation_config.level == CompilationLevel.PIECEWISE, ( + "Compilation level should be CompilationLevel.PIECEWISE " + "when cudagraph_mode piecewise cudagraphs is used, " f"cudagraph_mode={self.compilation_config.cudagraph_mode}" + ) if self.parallel_config.enable_dbo: a2a_backend = envs.VLLM_ALL2ALL_BACKEND - assert a2a_backend in \ - ["deepep_low_latency", "deepep_high_throughput"], \ - "Microbatching currently only supports the deepep_low_latency and "\ - f"deepep_high_throughput all2all backend. {a2a_backend} is not "\ - "supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\ - "variable to deepep_low_latency or deepep_high_throughput and "\ - "install the DeepEP kernels." - - mm_config = (self.model_config.multimodal_config - if self.model_config else None) + assert a2a_backend in ["deepep_low_latency", "deepep_high_throughput"], ( + "Microbatching currently only supports the deepep_low_latency and " + f"deepep_high_throughput all2all backend. {a2a_backend} is not " + "supported. To fix set the VLLM_ALL2ALL_BACKEND environment " + "variable to deepep_low_latency or deepep_high_throughput and " + "install the DeepEP kernels." 
+ ) + + mm_config = self.model_config.multimodal_config if self.model_config else None if mm_config and mm_config.mm_processing_device != "cpu": api_process_count = self.parallel_config._api_process_count api_process_rank = self.parallel_config._api_process_rank - local_gpu_count = (self.parallel_config.data_parallel_size_local * - self.parallel_config.world_size) + local_gpu_count = ( + self.parallel_config.data_parallel_size_local + * self.parallel_config.world_size + ) if api_process_rank != -1: from vllm.multimodal.utils import allocate_gpu_mm_processors @@ -674,8 +727,7 @@ def __post_init__(self): ) device = gpu_allocation[api_process_rank] - logger.info("Multi-modal processor will be run on device %s", - device) + logger.info("Multi-modal processor will be run on device %s", device) self.parallel_config._renderer_gpu_allocation = gpu_allocation mm_config.mm_processing_device = device @@ -684,12 +736,16 @@ def __post_init__(self): self.instance_id = random_uuid()[:5] # Do this after all the updates to compilation_config.level - if envs.VLLM_USE_V1 and \ - self.compilation_config.level == CompilationLevel.PIECEWISE: + if ( + envs.VLLM_USE_V1 + and self.compilation_config.level == CompilationLevel.PIECEWISE + ): self.compilation_config.set_splitting_ops_for_v1() - if (envs.VLLM_USE_V1 - and not self.scheduler_config.disable_hybrid_kv_cache_manager): + if ( + envs.VLLM_USE_V1 + and not self.scheduler_config.disable_hybrid_kv_cache_manager + ): # logger should only print warning message for hybrid models. As we # can't know whether the model is hybrid or not now, so we don't log # warning message here and will log it later. @@ -702,15 +758,18 @@ def __post_init__(self): if self.kv_events_config is not None: # Hybrid KV cache manager is not compatible with KV events. 
self.scheduler_config.disable_hybrid_kv_cache_manager = True - if self.model_config is not None and \ - self.model_config.attention_chunk_size is not None: - if self.speculative_config is not None and \ - self.speculative_config.use_eagle(): + if ( + self.model_config is not None + and self.model_config.attention_chunk_size is not None + ): + if ( + self.speculative_config is not None + and self.speculative_config.use_eagle() + ): # Hybrid KV cache manager is not yet supported with chunked # local attention + eagle. self.scheduler_config.disable_hybrid_kv_cache_manager = True - elif \ - not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: + elif not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: logger.warning( "There is a latency regression when using chunked local" " attention with the hybrid KV cache manager. Disabling" @@ -738,23 +797,26 @@ def has_blocked_weights(): if "none" not in custom_ops and "-quant_fp8" not in custom_ops: custom_ops.append("+quant_fp8") - def update_sizes_for_sequence_parallelism(self, - possible_sizes: list) -> list: + def update_sizes_for_sequence_parallelism(self, possible_sizes: list) -> list: # remove the sizes that not multiple of tp_size when # enable sequence parallelism removed_sizes = [ - size for size in possible_sizes + size + for size in possible_sizes if size % self.parallel_config.tensor_parallel_size != 0 ] if removed_sizes: logger.warning( "Batch sizes %s are removed because they are not " "multiple of tp_size %d when " - "sequence parallelism is enabled", removed_sizes, - self.parallel_config.tensor_parallel_size) + "sequence parallelism is enabled", + removed_sizes, + self.parallel_config.tensor_parallel_size, + ) return [ - size for size in possible_sizes + size + for size in possible_sizes if size % self.parallel_config.tensor_parallel_size == 0 ] @@ -799,21 +861,24 @@ def _set_cudagraph_sizes(self): # calculate the default `batch_size_capture_list` if not envs.VLLM_USE_V1: batch_size_capture_list 
= [] - if self.scheduler_config is not None and \ - self.model_config is not None and \ - not self.model_config.enforce_eager: - + if ( + self.scheduler_config is not None + and self.model_config is not None + and not self.model_config.enforce_eager + ): possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)] - if self.parallel_config.tensor_parallel_size > 1 and \ - self.compilation_config.pass_config.enable_sequence_parallelism: + if ( + self.parallel_config.tensor_parallel_size > 1 + and self.compilation_config.pass_config.enable_sequence_parallelism + ): possible_sizes = self.update_sizes_for_sequence_parallelism( - possible_sizes) + possible_sizes + ) # find the minimum size that is larger than max_num_seqs, # which then becomes the max_batchsize_to_capture larger_sizes = [ - x for x in possible_sizes - if x >= self.scheduler_config.max_num_seqs + x for x in possible_sizes if x >= self.scheduler_config.max_num_seqs ] if larger_sizes: max_batchsize_to_capture = larger_sizes[0] @@ -823,13 +888,11 @@ def _set_cudagraph_sizes(self): # filter out the sizes that are # larger than max_batchsize_to_capture batch_size_capture_list = [ - size for size in possible_sizes - if size <= max_batchsize_to_capture + size for size in possible_sizes if size <= max_batchsize_to_capture ] else: batch_size_capture_list = [] - if self.model_config is not None and \ - not self.model_config.enforce_eager: + if self.model_config is not None and not self.model_config.enforce_eager: cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes if len(cuda_graph_sizes) == 1: batch_size_capture_list = [1, 2, 4] + [ @@ -839,18 +902,21 @@ def _set_cudagraph_sizes(self): batch_size_capture_list = sorted(cuda_graph_sizes) else: raise TypeError(f"Invalid value for {cuda_graph_sizes=}.") - if self.parallel_config.tensor_parallel_size > 1 and \ - self.compilation_config.pass_config.enable_sequence_parallelism: - batch_size_capture_list = \ - 
self.update_sizes_for_sequence_parallelism(batch_size_capture_list) + if ( + self.parallel_config.tensor_parallel_size > 1 + and self.compilation_config.pass_config.enable_sequence_parallelism + ): + batch_size_capture_list = ( + self.update_sizes_for_sequence_parallelism( + batch_size_capture_list + ) + ) max_num_tokens = self.scheduler_config.max_num_batched_tokens batch_size_capture_list = [ - size for size in batch_size_capture_list - if size <= max_num_tokens + size for size in batch_size_capture_list if size <= max_num_tokens ] - self.compilation_config.init_with_cudagraph_sizes( - batch_size_capture_list) + self.compilation_config.init_with_cudagraph_sizes(batch_size_capture_list) def recalculate_max_model_len(self, max_model_len: int): # Can only be called in try_verify_and_update_config @@ -873,7 +939,10 @@ def try_verify_and_update_config(self): return from vllm.model_executor.models.config import ( - MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig) + MODELS_CONFIG_MAP, + HybridAttentionMambaModelConfig, + ) + cls = MODELS_CONFIG_MAP.get(architecture, None) if cls is not None: cls.verify_and_update_config(self) @@ -883,21 +952,26 @@ def try_verify_and_update_config(self): if self.model_config.convert_type == "classify": # Maybe convert ForCausalLM into ForSequenceClassification model. - from vllm.model_executor.models.adapters import ( - SequenceClassificationConfig) + from vllm.model_executor.models.adapters import SequenceClassificationConfig + SequenceClassificationConfig.verify_and_update_config(self) if hasattr(self.model_config, "model_weights") and is_runai_obj_uri( - self.model_config.model_weights): + self.model_config.model_weights + ): if self.load_config.load_format == "auto": - logger.info("Detected Run:ai model config. " - "Overriding `load_format` to 'runai_streamer'") + logger.info( + "Detected Run:ai model config. 
" + "Overriding `load_format` to 'runai_streamer'" + ) self.load_config.load_format = "runai_streamer" elif self.load_config.load_format != "runai_streamer": - raise ValueError(f"To load a model from S3, 'load_format' " - f"must be 'runai_streamer', " - f"but got '{self.load_config.load_format}'. " - f"Model: {self.model_config.model}") + raise ValueError( + f"To load a model from S3, 'load_format' " + f"must be 'runai_streamer', " + f"but got '{self.load_config.load_format}'. " + f"Model: {self.model_config.model}" + ) def __str__(self): return ( @@ -928,7 +1002,8 @@ def __str__(self): f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa f"pooler_config={self.model_config.pooler_config!r}, " - f"compilation_config={self.compilation_config!r}") + f"compilation_config={self.compilation_config!r}" + ) _current_vllm_config: Optional[VllmConfig] = None @@ -936,9 +1011,9 @@ def __str__(self): @contextmanager -def set_current_vllm_config(vllm_config: VllmConfig, - check_compile=False, - prefix: Optional[str] = None): +def set_current_vllm_config( + vllm_config: VllmConfig, check_compile=False, prefix: Optional[str] = None +): """ Temporarily set the current vLLM config. Used during model initialization. 
@@ -950,6 +1025,7 @@ def set_current_vllm_config(vllm_config: VllmConfig, old_vllm_config = _current_vllm_config old_prefix = _current_prefix from vllm.compilation.counter import compilation_counter + num_models_seen = compilation_counter.num_models_seen try: _current_vllm_config = vllm_config @@ -961,9 +1037,11 @@ def set_current_vllm_config(vllm_config: VllmConfig, if check_compile: vllm_config.compilation_config.custom_op_log_check() - if check_compile and \ - vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ - and compilation_counter.num_models_seen == num_models_seen: + if ( + check_compile + and vllm_config.compilation_config.level == CompilationLevel.PIECEWISE + and compilation_counter.num_models_seen == num_models_seen + ): # If the model supports compilation, # compilation_counter.num_models_seen should be increased # by at least 1. @@ -973,7 +1051,8 @@ def set_current_vllm_config(vllm_config: VllmConfig, "`torch.compile` is turned on, but the model %s" " does not support it. Please open an issue on GitHub" " if you want it to be supported.", - vllm_config.model_config.model) + vllm_config.model_config.model, + ) finally: _current_vllm_config = old_vllm_config _current_prefix = old_prefix @@ -994,6 +1073,7 @@ def get_current_vllm_config() -> VllmConfig: # config. logger.warning("Current vLLM config is not set.") from vllm.config import VllmConfig + return VllmConfig() return _current_vllm_config @@ -1002,8 +1082,7 @@ def get_current_model_prefix() -> str: """ Get the prefix of the model that's currently being initialized. """ - assert _current_prefix is not None, \ - "Current model prefix is not set. " + assert _current_prefix is not None, "Current model prefix is not set. 
" return _current_prefix @@ -1011,9 +1090,10 @@ def get_current_model_prefix() -> str: def get_layers_from_vllm_config( - vllm_config: VllmConfig, - layer_type: type[T], - layer_names: Optional[list[str]] = None) -> dict[str, T]: + vllm_config: VllmConfig, + layer_type: type[T], + layer_names: Optional[list[str]] = None, +) -> dict[str, T]: """ Get layers from the vLLM config. @@ -1024,8 +1104,7 @@ def get_layers_from_vllm_config( """ if layer_names is None: - layer_names = list( - vllm_config.compilation_config.static_forward_context.keys()) + layer_names = list(vllm_config.compilation_config.static_forward_context.keys()) forward_context = vllm_config.compilation_config.static_forward_context @@ -1067,19 +1146,23 @@ def allow_audio_chunking(self) -> bool: return self.min_energy_split_window_size is not None -def update_config(config: DataclassInstanceT, - overrides: dict[str, Any]) -> DataclassInstanceT: +def update_config( + config: DataclassInstanceT, overrides: dict[str, Any] +) -> DataclassInstanceT: processed_overrides = {} for field_name, value in overrides.items(): - assert hasattr( - config, field_name), f"{type(config)} has no field `{field_name}`" + assert hasattr(config, field_name), ( + f"{type(config)} has no field `{field_name}`" + ) current_value = getattr(config, field_name) if is_dataclass(current_value) and not is_dataclass(value): assert isinstance(value, dict), ( f"Overrides to {type(config)}.{field_name} must be a dict" - f" or {type(current_value)}, but got {type(value)}") + f" or {type(current_value)}, but got {type(value)}" + ) value = update_config( current_value, # type: ignore[type-var] - value) + value, + ) processed_overrides[field_name] = value return replace(config, **processed_overrides) diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 8e31edbe2b6e..51644cacf6d1 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -103,8 +103,7 @@ def compute_hash(self) -> str: # no factors to consider. 
# this config will not affect the computation graph. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str def get_limit_per_prompt(self, modality: str) -> int: @@ -139,6 +138,7 @@ def merge_mm_processor_kwargs( if init_device != inference_device: raise ValueError( "You cannot override the device for multi-modal preprocessing " - f"at runtime! Found: {init_device=} vs. {inference_device=}") + f"at runtime! Found: {init_device=} vs. {inference_device=}" + ) return kwargs | dict(inference_kwargs) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 2eb633cbac1f..cc78c27d1780 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -161,9 +161,9 @@ class ParallelConfig: placement_group: Optional[PlacementGroup] = None """ray distributed model workers placement group.""" - distributed_executor_backend: Optional[Union[str, - DistributedExecutorBackend, - type[ExecutorBase]]] = None + distributed_executor_backend: Optional[ + Union[str, DistributedExecutorBackend, type[ExecutorBase]] + ] = None """Backend to use for distributed model workers, either "ray" or "mp" (multiprocessing). If the product of pipeline_parallel_size and tensor_parallel_size is less than @@ -262,7 +262,8 @@ def stateless_init_dp_group(self) -> ProcessGroup: from torch.distributed import DistNetworkError from vllm.distributed.utils import ( - stateless_init_torch_distributed_process_group) + stateless_init_torch_distributed_process_group, + ) max_retries = 5 last_exc: Optional[Exception] = None @@ -274,12 +275,12 @@ def stateless_init_dp_group(self) -> ProcessGroup: self.get_next_dp_init_port(), self.data_parallel_rank, self.data_parallel_size, - backend="gloo") + backend="gloo", + ) except DistNetworkError as e: # We only want to retry when the root cause is EADDRINUSE. 
if "EADDRINUSE" in str(e): - logger.warning( - "Address already in use. Retrying with a new port.") + logger.warning("Address already in use. Retrying with a new port.") last_exc = e continue # try again with a new port raise e @@ -289,11 +290,8 @@ def stateless_init_dp_group(self) -> ProcessGroup: raise last_exc @staticmethod - def has_unfinished_dp(dp_group: ProcessGroup, - has_unfinished: bool) -> bool: - tensor = torch.tensor([has_unfinished], - dtype=torch.int32, - device="cpu") + def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool: + tensor = torch.tensor([has_unfinished], dtype=torch.int32, device="cpu") # dp rank 0: has_unfinished_seqs=True # dp rank 1: has_unfinished_seqs=False # aggregated: has_unfinished_seqs=True @@ -303,13 +301,10 @@ def has_unfinished_dp(dp_group: ProcessGroup, return aggregated_has_unfinished @staticmethod - def sync_kv_cache_memory_size(dp_group: ProcessGroup, - kv_cache_memory: int) -> int: + def sync_kv_cache_memory_size(dp_group: ProcessGroup, kv_cache_memory: int) -> int: if kv_cache_memory == -1: kv_cache_memory = torch.iinfo(torch.int64).max - tensor = torch.tensor([kv_cache_memory], - dtype=torch.int64, - device="cpu") + tensor = torch.tensor([kv_cache_memory], dtype=torch.int64, device="cpu") # we cannot use broadcast for stateless dp group since it depends # on global rank torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group) @@ -334,38 +329,40 @@ def compute_hash(self): def __post_init__(self) -> None: # Forward deprecated fields to their new location if self.num_redundant_experts is not None: - self.eplb_config.num_redundant_experts = ( - self.num_redundant_experts) + self.eplb_config.num_redundant_experts = self.num_redundant_experts logger.warning_once( "num_redundant_experts is deprecated and has been replaced " "with eplb_config.num_redundant_experts. This will be removed " "in v0.12.0. Changing this field after initialization will " - "have no effect.") + "have no effect." 
+ ) if self.eplb_window_size is not None: self.eplb_config.window_size = self.eplb_window_size logger.warning_once( "eplb_window_size is deprecated and has been replaced " "with eplb_config.window_size. This will be removed " "in v0.12.0. Changing this field after initialization will " - "have no effect.") + "have no effect." + ) if self.eplb_step_interval is not None: self.eplb_config.step_interval = self.eplb_step_interval logger.warning_once( "eplb_step_interval is deprecated and has been replaced " "with eplb_config.step_interval. This will be removed " "in v0.12.0. Changing this field after initialization will " - "have no effect.") + "have no effect." + ) if self.eplb_log_balancedness is not None: self.eplb_config.log_balancedness = self.eplb_log_balancedness logger.warning_once( "eplb_log_balancedness is deprecated and has been replaced " "with eplb_config.log_balancedness. This will be removed " "in v0.12.0. Changing this field after initialization will " - "have no effect.") + "have no effect." + ) # Continue with the rest of the initialization - self.world_size = self.pipeline_parallel_size * \ - self.tensor_parallel_size + self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size if self.distributed_executor_backend == "external_launcher": logger.info("Using external launcher for distributed inference.") @@ -374,26 +371,30 @@ def __post_init__(self) -> None: if self.data_parallel_size_local > self.data_parallel_size: raise ValueError( f"data_parallel_size_local ({self.data_parallel_size_local}) " - f"must be <= data_parallel_size ({self.data_parallel_size})") + f"must be <= data_parallel_size ({self.data_parallel_size})" + ) if self.data_parallel_size > 1 or self.data_parallel_size_local == 0: # Data parallel was specified in the engine args. 
if self.distributed_executor_backend == "external_launcher": # For external launcher, # we need to set the data parallel rank automatically - self.data_parallel_rank = int(os.environ["RANK"]) \ - // (self.world_size // self.data_parallel_size) - logger.info("Set data_parallel_rank to %d automatically.", - self.data_parallel_rank) + self.data_parallel_rank = int(os.environ["RANK"]) // ( + self.world_size // self.data_parallel_size + ) + logger.info( + "Set data_parallel_rank to %d automatically.", + self.data_parallel_rank, + ) if not self._data_parallel_master_port_list: self._data_parallel_master_port_list = get_open_ports_list(5) - self.data_parallel_master_port = \ - self._data_parallel_master_port_list.pop() + self.data_parallel_master_port = self._data_parallel_master_port_list.pop() if not (0 <= self.data_parallel_rank < self.data_parallel_size): raise ValueError( f"data_parallel_rank ({self.data_parallel_rank})" - f" must be in the range [0, {self.data_parallel_size})") + f" must be in the range [0, {self.data_parallel_size})" + ) else: # Otherwise fall back to env vars (e.g. for offline SPMD case). self.data_parallel_size = envs.VLLM_DP_SIZE @@ -403,8 +404,10 @@ def __post_init__(self) -> None: self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT if self.data_parallel_external_lb: - raise ValueError("data_parallel_external_lb can only " - "be set when data_parallel_size > 1") + raise ValueError( + "data_parallel_external_lb can only " + "be set when data_parallel_size > 1" + ) if self.distributed_executor_backend == "external_launcher": os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" @@ -414,14 +417,15 @@ def __post_init__(self) -> None: if not current_platform.is_cuda(): raise ValueError( "Expert parallelism load balancing is only supported on " - "CUDA devices now.") + "CUDA devices now." 
+ ) if self.eplb_config.num_redundant_experts < 0: raise ValueError( "num_redundant_experts must be non-negative, but got " - f"{self.eplb_config.num_redundant_experts}.") + f"{self.eplb_config.num_redundant_experts}." + ) if not self.enable_expert_parallel: - raise ValueError( - "enable_expert_parallel must be True to use EPLB.") + raise ValueError("enable_expert_parallel must be True to use EPLB.") if self.tensor_parallel_size * self.data_parallel_size <= 1: raise ValueError( "EPLB requires tensor_parallel_size or data_parallel_size " @@ -434,41 +438,50 @@ def __post_init__(self) -> None: "num_redundant_experts is set to " f"{self.eplb_config.num_redundant_experts} but EPLB is not " "enabled. Either enable EPLB or unset " - "num_redundant_experts.") + "num_redundant_experts." + ) if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. from vllm.executor import ray_utils + backend: DistributedExecutorBackend = "mp" ray_found = ray_utils.ray_is_available() if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: backend = "uni" - elif (current_platform.is_cuda() - and cuda_device_count_stateless() < self.world_size): + elif ( + current_platform.is_cuda() + and cuda_device_count_stateless() < self.world_size + ): if not ray_found: - raise ValueError("Unable to load Ray: " - f"{ray_utils.ray_import_err}. Ray is " - "required for multi-node inference, " - "please install Ray with `pip install " - "ray`.") + raise ValueError( + "Unable to load Ray: " + f"{ray_utils.ray_import_err}. Ray is " + "required for multi-node inference, " + "please install Ray with `pip install " + "ray`." 
+ ) backend = "ray" elif self.data_parallel_backend == "ray": - logger.info("Using ray distributed inference because " - "data_parallel_backend is ray") + logger.info( + "Using ray distributed inference because " + "data_parallel_backend is ray" + ) backend = "ray" elif ray_found: if self.placement_group: backend = "ray" else: from ray import is_initialized as ray_is_initialized + if ray_is_initialized(): from ray.util import get_current_placement_group + if get_current_placement_group(): backend = "ray" self.distributed_executor_backend = backend - logger.debug("Defaulting to use %s for distributed inference", - backend) + logger.debug("Defaulting to use %s for distributed inference", backend) if self.distributed_executor_backend is None and self.world_size == 1: self.distributed_executor_backend = "uni" @@ -477,39 +490,50 @@ def __post_init__(self) -> None: raise ValueError( "Invalid value of `_api_process_rank`. " f"Expected to be `-1` or `[0, {self._api_process_count})`, " - f"but found: {self._api_process_rank}") + f"but found: {self._api_process_rank}" + ) @property def use_ray(self) -> bool: return self.distributed_executor_backend == "ray" or ( isinstance(self.distributed_executor_backend, type) - and getattr(self.distributed_executor_backend, "uses_ray", False)) + and getattr(self.distributed_executor_backend, "uses_ray", False) + ) - @model_validator(mode='after') + @model_validator(mode="after") def _verify_args(self) -> Self: # Lazy import to avoid circular import from vllm.executor.executor_base import ExecutorBase from vllm.platforms import current_platform - if self.distributed_executor_backend is not None and not isinstance( - self.distributed_executor_backend, str) and not (isinstance( - self.distributed_executor_backend, type) and issubclass( - self.distributed_executor_backend, ExecutorBase)): + + if ( + self.distributed_executor_backend is not None + and not isinstance(self.distributed_executor_backend, str) + and not ( + 
isinstance(self.distributed_executor_backend, type) + and issubclass(self.distributed_executor_backend, ExecutorBase) + ) + ): raise ValueError( "Unrecognized distributed executor backend " f"{self.distributed_executor_backend}. Supported " "values are 'ray', 'mp' 'uni', 'external_launcher', " - " custom ExecutorBase subclass or its import path.") + " custom ExecutorBase subclass or its import path." + ) if self.use_ray: from vllm.executor import ray_utils + ray_utils.assert_ray_available() if not current_platform.use_custom_allreduce(): self.disable_custom_all_reduce = True logger.debug( "Disabled the custom all-reduce kernel because it is not " - "supported on current platform.") + "supported on current platform." + ) if self.ray_workers_use_nsight and not self.use_ray: - raise ValueError("Unable to use nsight profiling unless workers " - "run with Ray.") + raise ValueError( + "Unable to use nsight profiling unless workers run with Ray." + ) return self diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 70f08a6b927c..d8fd931ed5a2 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -3,13 +3,21 @@ import time from abc import ABC, abstractmethod from collections import defaultdict -from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping, - Sequence) +from collections.abc import Callable, Generator, ItemsView, Iterable, Mapping, Sequence from dataclasses import dataclass, field, replace from enum import Enum from functools import lru_cache -from typing import (TYPE_CHECKING, Any, Generic, NamedTuple, Optional, - Protocol, Union, cast, overload) +from typing import ( + TYPE_CHECKING, + Any, + Generic, + NamedTuple, + Optional, + Protocol, + Union, + cast, + overload, +) import regex as re import torch @@ -17,20 +25,28 @@ from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_processor_from_config -from vllm.transformers_utils.tokenizer import 
(AnyTokenizer, decode_tokens, - encode_tokens) -from vllm.utils import (flatten_2d_lists, full_groupby, - get_allowed_kwarg_only_overrides) +from vllm.transformers_utils.tokenizer import AnyTokenizer, decode_tokens, encode_tokens +from vllm.utils import flatten_2d_lists, full_groupby, get_allowed_kwarg_only_overrides from vllm.utils.jsontree import JSONTree, json_map_leaves from .hasher import MultiModalHasher -from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, - MultiModalFieldConfig, MultiModalInputs, - MultiModalKwargsItem, MultiModalKwargsItems, - MultiModalKwargsOptionalItems, MultiModalUUIDDict, - PlaceholderRange) -from .parse import (DictEmbeddingItems, EmbeddingItems, MultiModalDataItems, - MultiModalDataParser) +from .inputs import ( + MultiModalDataDict, + MultiModalEncDecInputs, + MultiModalFieldConfig, + MultiModalInputs, + MultiModalKwargsItem, + MultiModalKwargsItems, + MultiModalKwargsOptionalItems, + MultiModalUUIDDict, + PlaceholderRange, +) +from .parse import ( + DictEmbeddingItems, + EmbeddingItems, + MultiModalDataItems, + MultiModalDataParser, +) if TYPE_CHECKING: from transformers.configuration_utils import PretrainedConfig @@ -57,9 +73,7 @@ def _cached_encode( *, add_special_tokens: Optional[bool] = None, ) -> list[int]: - return encode_tokens(tokenizer, - text, - add_special_tokens=add_special_tokens) + return encode_tokens(tokenizer, text, add_special_tokens=add_special_tokens) @lru_cache(maxsize=2048) @@ -69,9 +83,9 @@ def _cached_decode( *, skip_special_tokens: Optional[bool] = None, ) -> str: - return decode_tokens(tokenizer, - list(token_ids), - skip_special_tokens=skip_special_tokens) + return decode_tokens( + tokenizer, list(token_ids), skip_special_tokens=skip_special_tokens + ) def _seq2text(tokenizer: AnyTokenizer, seq: PromptSeq) -> str: @@ -89,24 +103,22 @@ def _seq2tokens(tokenizer: AnyTokenizer, seq: PromptSeq) -> list[int]: class _GetMatchIndex(Protocol): - def __call__( self, tokenizer: AnyTokenizer, prompt: 
PromptSeq, start_idx: int = 0, - ) -> Optional[int]: - ... + ) -> Optional[int]: ... @dataclass class PromptIndex: """Resolves to an index in the prompt.""" + get_match_index: _GetMatchIndex class PromptIndexTargets: - @staticmethod def start() -> PromptIndex: """ @@ -139,9 +151,7 @@ def get_match_index( else: if isinstance(prefix, str): # Make both `list[int]` - prefix = encode_tokens(tokenizer, - prefix, - add_special_tokens=False) + prefix = encode_tokens(tokenizer, prefix, add_special_tokens=False) match_idx = len(prefix) return match_idx if prompt[:match_idx] == prefix else None @@ -181,8 +191,7 @@ class PromptUpdateDetails(Generic[_S]): full: _S """The full content.""" - is_embed: Optional[Callable[[AnyTokenizer, PromptSeq], - torch.Tensor]] = None + is_embed: Optional[Callable[[AnyTokenizer, PromptSeq], torch.Tensor]] = None """ Given [`full`][vllm.multimodal.processing.PromptUpdateDetails.full], return a boolean mask of shape `(len(full),)` indicating which positions @@ -203,7 +212,6 @@ def select_text( seq: _S, embed_text: str, ) -> "PromptUpdateDetails[_S]": - def is_embed(tokenizer: AnyTokenizer, full: PromptSeq) -> torch.Tensor: embed_token_ids = encode_tokens(tokenizer, embed_text) token_ids = _seq2tokens(tokenizer, full) @@ -220,7 +228,6 @@ def select_token_id( seq: _S, embed_token_id: int, ) -> "PromptUpdateDetails[_S]": - def is_embed(tokenizer: AnyTokenizer, full: PromptSeq) -> torch.Tensor: token_ids = _seq2tokens(tokenizer, full) @@ -238,8 +245,7 @@ def is_embed(tokenizer: AnyTokenizer, full: PromptSeq) -> torch.Tensor: specify which part. 
""" -PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo], - PromptUpdateInfo] +PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo], PromptUpdateInfo] """ Given the index of the processed item within [`modality`][vllm.multimodal.processing.PromptUpdate.modality], @@ -408,11 +414,13 @@ class PromptReplacement(PromptUpdate): modality="image", target="", replacement=PromptUpdateDetails( - full="".join([ - "", - "" * image_feature_size, - "", - ]), + full="".join( + [ + "", + "" * image_feature_size, + "", + ] + ), features="" * image_feature_size, ), ) @@ -426,8 +434,9 @@ class PromptReplacement(PromptUpdate): modality="image", target=[image_token_id], replacement=PromptUpdateDetails( - full=([image_bos_id] + [image_token_id] * image_feature_size - + [image_eos_id]), + full=( + [image_bos_id] + [image_token_id] * image_feature_size + [image_eos_id] + ), features=[image_token_id] * image_feature_size, ), ) @@ -459,10 +468,8 @@ class _HasModalityAttr(Protocol): class _HasModalityProp(Protocol): - @property - def modality(self) -> str: - ... + def modality(self) -> str: ... 
_M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp]) @@ -520,9 +527,7 @@ def iter_token_matches( target_token_ids = _seq2tokens(tokenizer, target) - for match in iter_token_matches(prompt, - target_token_ids, - start_idx=start_idx): + for match in iter_token_matches(prompt, target_token_ids, start_idx=start_idx): yield PromptTargetMatch(match.start_idx, match.end_idx) def iter_text_matches( @@ -544,8 +549,7 @@ def iter_text_matches( target_text = _seq2text(tokenizer, target) - for match in re.finditer(re.escape(target_text), prompt, - pos=start_idx): + for match in re.finditer(re.escape(target_text), prompt, pos=start_idx): yield PromptTargetMatch(match.start(), match.end()) def iter_matches( @@ -557,9 +561,7 @@ def iter_matches( ) -> Generator[PromptTargetMatch]: """Yield each instance of `self.target` found in `prompt`.""" if isinstance(prompt, str): - return self.iter_text_matches(prompt, - tokenizer, - start_idx=start_idx) + return self.iter_text_matches(prompt, tokenizer, start_idx=start_idx) return self.iter_token_matches(prompt, tokenizer, start_idx=start_idx) @@ -680,9 +682,9 @@ def _find_matches( break # Already found a match for this item for match in update.iter_matches( - prompt, - tokenizer, - start_idx=prev_end_idx, + prompt, + tokenizer, + start_idx=prev_end_idx, ): # All matches should share the same mode if mode is None: @@ -723,8 +725,7 @@ def _apply_matches( out_seqs = list[Union[str, list[int]]]() out_result: MultiModalPromptUpdatesApplyResult = { - m: [None] * len(items) - for m, items in mm_prompt_updates.items() + m: [None] * len(items) for m, items in mm_prompt_updates.items() } start_idx = prev_end_idx = 0 @@ -743,8 +744,7 @@ def _apply_matches( for (modality, item_idx), (match, update_idx) in matches_to_apply: found = True - matched_update = mm_prompt_updates[modality][item_idx][ - update_idx] + matched_update = mm_prompt_updates[modality][item_idx][update_idx] matched_content = matched_update.content.full if mode == 
UpdateMode.INSERT: @@ -756,9 +756,10 @@ def _apply_matches( out_seqs.append(prompt[prev_end_idx:end_idx_to_insert]) out_seqs.append( - _seq2text(tokenizer, matched_content - ) if isinstance(prompt, str) else _seq2tokens( - tokenizer, matched_content)) + _seq2text(tokenizer, matched_content) + if isinstance(prompt, str) + else _seq2tokens(tokenizer, matched_content) + ) out_result[modality][item_idx] = update_idx # Exclude overlapping matches @@ -784,8 +785,7 @@ def apply_token_matches( the same placeholder tokens. In that case, the modality that appears earlier in `mm_prompt_updates` takes priority. """ - token_id_seqs, result = _apply_matches(prompt, mm_prompt_updates, - tokenizer) + token_id_seqs, result = _apply_matches(prompt, mm_prompt_updates, tokenizer) return flatten_2d_lists(token_id_seqs), result @@ -847,8 +847,7 @@ def _iter_placeholders( if prompt[start_idx:end_idx_full] == content_tokens_full: content_is_embed = content.is_embed if content_is_embed is not None: - content_is_embed = content_is_embed( - tokenizer, content.full) + content_is_embed = content_is_embed(tokenizer, content.full) yield PlaceholderFeaturesInfo( modality=modality, @@ -899,16 +898,14 @@ class InputProcessingContext: """The tokenizer used to tokenize the inputs.""" @overload - def get_hf_config(self, /) -> "PretrainedConfig": - ... + def get_hf_config(self, /) -> "PretrainedConfig": ... @overload def get_hf_config( self, typ: Union[type[_C], tuple[type[_C], ...]], /, - ) -> _C: - ... + ) -> _C: ... def get_hf_config( self, @@ -930,9 +927,11 @@ def get_hf_config( hf_config = self.model_config.hf_config if not isinstance(hf_config, typ): - raise TypeError("Invalid type of HuggingFace config. " - f"Expected type: {typ}, but " - f"found type: {type(hf_config)}") + raise TypeError( + "Invalid type of HuggingFace config. 
" + f"Expected type: {typ}, but " + f"found type: {type(hf_config)}" + ) return hf_config @@ -956,8 +955,7 @@ def get_mm_config(self): return mm_config @overload - def get_hf_processor(self, /, **kwargs: object) -> "ProcessorMixin": - ... + def get_hf_processor(self, /, **kwargs: object) -> "ProcessorMixin": ... @overload def get_hf_processor( @@ -965,8 +963,7 @@ def get_hf_processor( typ: Union[type[_P], tuple[type[_P], ...]], /, **kwargs: object, - ) -> _P: - ... + ) -> _P: ... def get_hf_processor( self, @@ -1041,6 +1038,7 @@ def _postprocess_one(x: object): # Async GPU -> CPU requires explicit synchronization if is_mm_processing_gpu: from vllm.platforms import current_platform + synchronize = current_platform.synchronize if synchronize is not None: synchronize() @@ -1073,17 +1071,21 @@ def call_hf_processor( ) try: - output = hf_processor(**data, - **allowed_kwargs, - return_tensors="pt") + output = hf_processor(**data, **allowed_kwargs, return_tensors="pt") except Exception as exc: # See https://github.com/huggingface/tokenizers/issues/537 - if (isinstance(exc, RuntimeError) and exc - and exc.args[0] == "Already borrowed" - and num_tries < max_tries): + if ( + isinstance(exc, RuntimeError) + and exc + and exc.args[0] == "Already borrowed" + and num_tries < max_tries + ): logger.warning( "Failed to acquire tokenizer in current thread. 
" - "Retrying (%d/%d)...", num_tries, max_tries) + "Retrying (%d/%d)...", + num_tries, + max_tries, + ) time.sleep(0.5) return self.call_hf_processor( hf_processor, @@ -1093,8 +1095,10 @@ def call_hf_processor( max_tries=max_tries, ) - msg = (f"Failed to apply {type(hf_processor).__name__} " - f"on data={data} with kwargs={allowed_kwargs}") + msg = ( + f"Failed to apply {type(hf_processor).__name__} " + f"on data={data} with kwargs={allowed_kwargs}" + ) raise ValueError(msg) from exc @@ -1161,8 +1165,11 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]: for modality, supported_limit in supported_mm_limits.items(): user_limit = mm_config.get_limit_per_prompt(modality) - allowed_limits[modality] = (user_limit if supported_limit is None - else min(user_limit, supported_limit)) + allowed_limits[modality] = ( + user_limit + if supported_limit is None + else min(user_limit, supported_limit) + ) return allowed_limits @@ -1173,7 +1180,7 @@ def get_mm_max_tokens_per_item( ) -> Optional[Mapping[str, int]]: """ Return the maximum number of tokens per item of for each modality. - + When `None` (the default) is returned, vLLM will generate dummy inputs (images/videos) at maximum possible sizes and process them to determine the maximum token count per modality. @@ -1184,7 +1191,7 @@ def get_mm_max_tokens_per_item( counts, avoiding the need for dummy input generation and processing. 
Note: - The maximum number of tokens per item of each modality returned + The maximum number of tokens per item of each modality returned from this function should respect the model's maximum sequence length and the maximum number of items of each modality allowed, and agree with dummy inputs (images/videos) at maximum possible @@ -1264,10 +1271,7 @@ def __call__( *, mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: - return self.apply(prompt, - mm_data, - hf_processor_mm_kwargs, - mm_uuids=mm_uuids) + return self.apply(prompt, mm_data, hf_processor_mm_kwargs, mm_uuids=mm_uuids) def _get_data_parser(self) -> MultiModalDataParser: """ @@ -1295,8 +1299,7 @@ def validate_num_items( limit = min(supported_limit, allowed_limit) if num_items > limit: - msg = (f"At most {limit} {modality}(s) may be provided in " - "one prompt.") + msg = f"At most {limit} {modality}(s) may be provided in one prompt." if num_items <= supported_limit: msg += " Set `--limit-mm-per-prompt` to increase this limit." 
@@ -1358,8 +1361,10 @@ def _bind_and_group_updates( mm_item_counts: Mapping[str, int], ) -> MultiModalPromptUpdates: return { - modality: [[update.resolve(item_idx) for update in updates] - for item_idx in range(mm_item_counts.get(modality, 0))] + modality: [ + [update.resolve(item_idx) for update in updates] + for item_idx in range(mm_item_counts.get(modality, 0)) + ] for modality, updates in full_groupby_modality(prompt_updates) } @@ -1404,8 +1409,7 @@ def _find_mm_placeholders( ) -> Mapping[str, list[PlaceholderFeaturesInfo]]: tokenizer = self.info.get_tokenizer() - return find_mm_placeholders(new_token_ids, mm_prompt_updates, - tokenizer) + return find_mm_placeholders(new_token_ids, mm_prompt_updates, tokenizer) def _get_hf_mm_data( self, @@ -1455,7 +1459,8 @@ def _hf_processor_applies_updates( """ return not any( isinstance(items, (EmbeddingItems, DictEmbeddingItems)) - for items in mm_items.values()) + for items in mm_items.values() + ) def _apply_hf_processor_text_mm( self, @@ -1480,7 +1485,7 @@ def _apply_hf_processor_text_mm( ) processed_data.update(passthrough_data) - prompt_ids, = processed_data.pop("input_ids").tolist() + (prompt_ids,) = processed_data.pop("input_ids").tolist() is_update_applied = self._hf_processor_applies_updates( prompt_text=prompt_text, @@ -1583,8 +1588,7 @@ def _apply_hf_processor_main( tokenization_kwargs=tokenization_kwargs, ) - prompt_ids = self._apply_hf_processor_text_only( - prompt, tokenization_kwargs) + prompt_ids = self._apply_hf_processor_text_only(prompt, tokenization_kwargs) else: prompt_ids = self._apply_hf_processor_tokens_only(prompt) @@ -1630,10 +1634,11 @@ def _hash_mm_items( # hash if `hf_processor_mm_kwargs` or `tokenization_kwargs` # are provided. This is because the processed multimodal # inputs can be different depending on the processor kwargs. 
- if item_uuid is None or \ - hf_processor_mm_kwargs or \ - tokenization_kwargs: - + if ( + item_uuid is None + or hf_processor_mm_kwargs + or tokenization_kwargs + ): # NOTE: use provided hash string to hash with kwargs # if available for better performance. item = item_uuid if item_uuid is not None else item @@ -1642,16 +1647,20 @@ def _hash_mm_items( model_id=model_id, **{modality: item}, **hf_processor_mm_kwargs, - **tokenization_kwargs)) + **tokenization_kwargs, + ) + ) else: computed.append(item_uuid) hashes[modality] = computed else: hashes[modality] = [ - MultiModalHasher.hash_kwargs(model_id=model_id, - **{modality: item}, - **hf_processor_mm_kwargs, - **tokenization_kwargs) + MultiModalHasher.hash_kwargs( + model_id=model_id, + **{modality: item}, + **hf_processor_mm_kwargs, + **tokenization_kwargs, + ) for item in items ] @@ -1664,13 +1673,13 @@ def _get_cache_missing_items( mm_hashes: MultiModalHashes, ) -> MultiModalDataItems: mm_is_cached = { - modality: cache.is_cached(hashes) - for modality, hashes in mm_hashes.items() + modality: cache.is_cached(hashes) for modality, hashes in mm_hashes.items() } mm_missing_idxs = { modality: [ - idx for idx, item_is_cached in enumerate(items_is_cached) + idx + for idx, item_is_cached in enumerate(items_is_cached) if not item_is_cached ] for modality, items_is_cached in mm_is_cached.items() @@ -1683,7 +1692,8 @@ def _get_cache_missing_items( if data is None: raise ValueError( f"Cache miss for {modality} at index {idx} " - f"but data is not provided.") + f"but data is not provided." 
+ ) else: missing_modality_data.append(data) mm_missing_data[modality] = missing_modality_data @@ -1711,20 +1721,18 @@ def _merge_mm_kwargs( # Need to calculate this at the beginning to avoid skipping cache logic # for subsequently repeated items in the same modality mm_is_cached = { - modality: cache.is_cached(hashes) - for modality, hashes in mm_hashes.items() + modality: cache.is_cached(hashes) for modality, hashes in mm_hashes.items() } mm_missing_next_idx = defaultdict[str, int](lambda: 0) - merged_kwargs = defaultdict[str, - list[Optional[MultiModalKwargsItem]]](list) - merged_prompt_updates = defaultdict[ - str, list[Sequence[ResolvedPromptUpdate]]](list) + merged_kwargs = defaultdict[str, list[Optional[MultiModalKwargsItem]]](list) + merged_prompt_updates = defaultdict[str, list[Sequence[ResolvedPromptUpdate]]]( + list + ) for modality, hashes in mm_hashes.items(): missing_kwargs = mm_missing_kwargs.get(modality, []) - missing_prompt_updates = mm_missing_prompt_updates.get( - modality, []) + missing_prompt_updates = mm_missing_prompt_updates.get(modality, []) for item_idx, item_hash in enumerate(hashes): kwargs: Optional[MultiModalKwargsItem] @@ -1742,10 +1750,12 @@ def _merge_mm_kwargs( kwargs, updates = cache.get_and_update_item(item, item_hash) merged_kwargs[modality].append(kwargs) - merged_prompt_updates[modality].append([ - self._recompute_cached_prompt_update(update, item_idx) - for update in updates - ]) + merged_prompt_updates[modality].append( + [ + self._recompute_cached_prompt_update(update, item_idx) + for update in updates + ] + ) mm_kwargs = MultiModalKwargsItems(merged_kwargs) mm_prompt_updates = dict(merged_prompt_updates) @@ -1775,15 +1785,16 @@ def _apply_hf_processor( mm_kwargs = MultiModalKwargsItems.from_hf_inputs( mm_processed_data, - self._get_mm_fields_config(mm_processed_data, - hf_processor_mm_kwargs), + self._get_mm_fields_config(mm_processed_data, hf_processor_mm_kwargs), ) # Use overrides if provided; fallback to data-dependent 
hashing. - mm_hashes = self._hash_mm_items(mm_data_items, - hf_processor_mm_kwargs, - tokenization_kwargs, - mm_uuids=mm_uuids) + mm_hashes = self._hash_mm_items( + mm_data_items, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_uuids=mm_uuids, + ) mm_prompt_updates = self._get_mm_prompt_updates( mm_data_items, @@ -1824,10 +1835,12 @@ def _cached_apply_hf_processor( mm_uuids=mm_uuids, ) - mm_hashes = self._hash_mm_items(mm_data_items, - hf_processor_mm_kwargs, - tokenization_kwargs, - mm_uuids=mm_uuids) + mm_hashes = self._hash_mm_items( + mm_data_items, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_uuids=mm_uuids, + ) mm_missing_data_items = self._get_cache_missing_items( cache=cache, @@ -1852,8 +1865,9 @@ def _cached_apply_hf_processor( mm_missing_kwargs = MultiModalKwargsItems.from_hf_inputs( mm_missing_processed_data, - self._get_mm_fields_config(mm_missing_processed_data, - hf_processor_mm_kwargs), + self._get_mm_fields_config( + mm_missing_processed_data, hf_processor_mm_kwargs + ), ) mm_missing_prompt_updates = self._get_mm_prompt_updates( @@ -1916,8 +1930,9 @@ def _apply_prompt_updates( # of the search text in the prompt, we instead perform string-based # updates on the decoded token IDs, then encode them back. 
if all( - all(update_idx is not None for update_idx in update_idxs) - for update_idxs in match_result.values()): + all(update_idx is not None for update_idx in update_idxs) + for update_idxs in match_result.values() + ): new_text = decode_tokens(tokenizer, new_token_ids) else: new_text, match_result = self._apply_text_matches( @@ -1931,16 +1946,17 @@ def _apply_prompt_updates( add_special_tokens=False, ) - matched_updates = defaultdict[ - str, list[Sequence[ResolvedPromptUpdate]]](list) + matched_updates = defaultdict[str, list[Sequence[ResolvedPromptUpdate]]](list) for modality, update_idxs in match_result.items(): for item_idx, update_idx in enumerate(update_idxs): assert update_idx is not None, ( "Failed to apply prompt replacement for " - f"mm_items[{modality!r}][{item_idx}]") + f"mm_items[{modality!r}][{item_idx}]" + ) matched_updates[modality].append( - [mm_prompt_updates[modality][item_idx][update_idx]]) + [mm_prompt_updates[modality][item_idx][update_idx]] + ) placeholders = self._find_mm_placeholders( new_token_ids, @@ -1965,7 +1981,8 @@ def _validate_mm_kwargs( "There is likely a problem with your " "implementation of merged multi-modal processor for this " "model (usually arising from an inconsistency between " - "`_call_hf_processor` and `_get_mm_fields_config`).") + "`_call_hf_processor` and `_get_mm_fields_config`)." + ) def _validate_mm_placeholders( self, @@ -1986,7 +2003,8 @@ def _validate_mm_placeholders( "This is likely because you forgot to include input " "placeholder tokens (e.g., ``, `<|image_pad|>`) " "in the prompt. If the model has a chat template, make " - "sure you have applied it before calling `LLM.generate`.") + "sure you have applied it before calling `LLM.generate`." 
+ ) def _maybe_apply_prompt_updates( self, @@ -2085,7 +2103,6 @@ def apply( class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): - @abstractmethod def create_encoder_prompt( self, @@ -2119,9 +2136,9 @@ def _get_enc_dec_inputs( tokenizer = self.info.get_tokenizer() decoder_prompt = self.create_decoder_prompt(prompt, mm_data) if isinstance(decoder_prompt, str): - decoder_prompt_ids = encode_tokens(tokenizer, - decoder_prompt, - add_special_tokens=False) + decoder_prompt_ids = encode_tokens( + tokenizer, decoder_prompt, add_special_tokens=False + ) else: decoder_prompt_ids = decoder_prompt decoder_prompt = decode_tokens(tokenizer, decoder_prompt) @@ -2129,11 +2146,11 @@ def _get_enc_dec_inputs( mm_inputs = MultiModalEncDecInputs( encoder_prompt=encoder_inputs["prompt"], encoder_prompt_token_ids=encoder_inputs["prompt_token_ids"], - **encoder_inputs) - mm_inputs.update({ - "prompt": decoder_prompt, - "prompt_token_ids": decoder_prompt_ids - }) + **encoder_inputs, + ) + mm_inputs.update( + {"prompt": decoder_prompt, "prompt_token_ids": decoder_prompt_ids} + ) return mm_inputs def apply( diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index c41d21b5c6df..7921f1ea69bb 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -28,8 +28,12 @@ _M = TypeVar("_M") if TYPE_CHECKING: - from .inputs import (BatchedTensorInputs, MultiModalKwargsItem, - MultiModalKwargsItems, MultiModalPlaceholderDict) + from .inputs import ( + BatchedTensorInputs, + MultiModalKwargsItem, + MultiModalKwargsItems, + MultiModalPlaceholderDict, + ) else: BatchedTensorInputs = Any MultiModalKwargsItem = Any @@ -37,12 +41,12 @@ MultiModalPlaceholderDict = Any global_thread_pool = ThreadPoolExecutor( - max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT) + max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT +) atexit.register(global_thread_pool.shutdown) class MediaConnector: - def __init__( self, media_io_kwargs: Optional[dict[str, dict[str, Any]]] = None, @@ -52,9 
+56,9 @@ def __init__( ) -> None: """ Args: - media_io_kwargs: Additional args passed to process media - inputs, keyed by modalities. For example, - to set num_frames for video, set + media_io_kwargs: Additional args passed to process media + inputs, keyed by modalities. For example, + to set num_frames for video, set `--media-io-kwargs '{"video":{"num_frames":40}}'` connection: HTTP connection client to download media contents. allowed_local_media_path: A local directory to load media files @@ -62,8 +66,9 @@ def __init__( """ super().__init__() - self.media_io_kwargs: dict[str, dict[ - str, Any]] = media_io_kwargs if media_io_kwargs else {} + self.media_io_kwargs: dict[str, dict[str, Any]] = ( + media_io_kwargs if media_io_kwargs else {} + ) self.connection = connection if allowed_local_media_path: @@ -72,11 +77,13 @@ def __init__( if not allowed_local_media_path_.exists(): raise ValueError( "Invalid `--allowed-local-media-path`: The path " - f"{allowed_local_media_path_} does not exist.") + f"{allowed_local_media_path_} does not exist." + ) if not allowed_local_media_path_.is_dir(): raise ValueError( "Invalid `--allowed-local-media-path`: The path " - f"{allowed_local_media_path_} must be a directory.") + f"{allowed_local_media_path_} must be a directory." + ) else: allowed_local_media_path_ = None @@ -103,14 +110,16 @@ def _load_file_url( ) -> _M: # type: ignore[type-var] allowed_local_media_path = self.allowed_local_media_path if allowed_local_media_path is None: - raise RuntimeError("Cannot load local files without " - "`--allowed-local-media-path`.") + raise RuntimeError( + "Cannot load local files without `--allowed-local-media-path`." + ) filepath = Path(url2pathname(url_spec.path)) if allowed_local_media_path not in filepath.resolve().parents: raise ValueError( f"The file path {filepath} must be a subpath " - f"of `--allowed-local-media-path` {allowed_local_media_path}.") + f"of `--allowed-local-media-path` {allowed_local_media_path}." 
+ ) return media_io.load_file(filepath) @@ -151,20 +160,19 @@ async def load_from_url_async( if url_spec.scheme.startswith("http"): connection = self.connection data = await connection.async_get_bytes(url, timeout=fetch_timeout) - future = loop.run_in_executor(global_thread_pool, - media_io.load_bytes, data) + future = loop.run_in_executor(global_thread_pool, media_io.load_bytes, data) return await future if url_spec.scheme == "data": - future = loop.run_in_executor(global_thread_pool, - self._load_data_url, url_spec, - media_io) + future = loop.run_in_executor( + global_thread_pool, self._load_data_url, url_spec, media_io + ) return await future if url_spec.scheme == "file": - future = loop.run_in_executor(global_thread_pool, - self._load_file_url, url_spec, - media_io) + future = loop.run_in_executor( + global_thread_pool, self._load_file_url, url_spec, media_io + ) return await future msg = "The URL must be either a HTTP, data or file URL." raise ValueError(msg) @@ -210,8 +218,9 @@ def fetch_image( By default, the image is converted into RGB format. """ - image_io = ImageMediaIO(image_mode=image_mode, - **self.media_io_kwargs.get("image", {})) + image_io = ImageMediaIO( + image_mode=image_mode, **self.media_io_kwargs.get("image", {}) + ) try: return self.load_from_url( @@ -234,8 +243,9 @@ async def fetch_image_async( By default, the image is converted into RGB format. """ - image_io = ImageMediaIO(image_mode=image_mode, - **self.media_io_kwargs.get("image", {})) + image_io = ImageMediaIO( + image_mode=image_mode, **self.media_io_kwargs.get("image", {}) + ) try: return await self.load_from_url_async( @@ -256,10 +266,10 @@ def fetch_video( """ Load video from an HTTP or base64 data URL. 
""" - image_io = ImageMediaIO(image_mode=image_mode, - **self.media_io_kwargs.get("image", {})) - video_io = VideoMediaIO(image_io, - **self.media_io_kwargs.get("video", {})) + image_io = ImageMediaIO( + image_mode=image_mode, **self.media_io_kwargs.get("image", {}) + ) + video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {})) return self.load_from_url( video_url, @@ -278,10 +288,10 @@ async def fetch_video_async( By default, the image is converted into RGB format. """ - image_io = ImageMediaIO(image_mode=image_mode, - **self.media_io_kwargs.get("image", {})) - video_io = VideoMediaIO(image_io, - **self.media_io_kwargs.get("video", {})) + image_io = ImageMediaIO( + image_mode=image_mode, **self.media_io_kwargs.get("image", {}) + ) + video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {})) return await self.load_from_url_async( video_url, @@ -362,14 +372,15 @@ def allocate_gpu_mm_processors( ] else: # Already targeted a specific GPU - (device_idx, ) = map(int, rest) + (device_idx,) = map(int, rest) processor_gpu_idxs = [device_idx] * mm_processor_count return [f"{device_type}:{gpu_idx}" for gpu_idx in processor_gpu_idxs] def argsort_mm_positions( - mm_positions: MultiModalPlaceholderDict) -> list[tuple[str, int]]: + mm_positions: MultiModalPlaceholderDict, +) -> list[tuple[str, int]]: """ Given a `MultiModalPlaceholderDict`, output a sequence of keys to sort the dictionary by `offset` (starting index in the input sequence) @@ -379,9 +390,11 @@ def argsort_mm_positions( A list of `(modality, idx)`, which can be used to access an item by `mm_positions[modality][idx]`. 
""" - flat_items = ((modality, idx, item) - for modality, items in mm_positions.items() - for idx, item in enumerate(items)) + flat_items = ( + (modality, idx, item) + for modality, items in mm_positions.items() + for idx, item in enumerate(items) + ) sorted_flat_items = sorted(flat_items, key=lambda x: x[2].offset) @@ -389,17 +402,18 @@ def argsort_mm_positions( # Temporary back-compatibility for plugins that define model runner -@deprecated("`group_mm_inputs_by_modality` is superseded by " - "`group_mm_kwargs_by_modality` and will be removed in v0.13. " - "Please use `group_mm_kwargs_by_modality` instead.") +@deprecated( + "`group_mm_inputs_by_modality` is superseded by " + "`group_mm_kwargs_by_modality` and will be removed in v0.13. " + "Please use `group_mm_kwargs_by_modality` instead." +) def group_mm_inputs_by_modality( - mm_inputs: list[MultiModalKwargsItems] + mm_inputs: list[MultiModalKwargsItems], ) -> list[list[MultiModalKwargsItems]]: if not mm_inputs: return [] - def modality_group_func( - mm_input: MultiModalKwargsItems) -> Union[str, int]: + def modality_group_func(mm_input: MultiModalKwargsItems) -> Union[str, int]: # If the input has multiple modalities, return an id as the unique key # for the mm_input input. if len(mm_input) > 1: @@ -410,9 +424,7 @@ def modality_group_func( raise AssertionError("This line should be unreachable.") - return [ - list(group) for _, group in groupby(mm_inputs, key=modality_group_func) - ] + return [list(group) for _, group in groupby(mm_inputs, key=modality_group_func)] def group_mm_kwargs_by_modality( @@ -473,9 +485,7 @@ def fetch_audio( audio_url: URL of the audio file to fetch. audio_io_kwargs: Additional kwargs passed to handle audio IO. 
""" - media_io_kwargs = None if not audio_io_kwargs else { - "audio": audio_io_kwargs - } + media_io_kwargs = None if not audio_io_kwargs else {"audio": audio_io_kwargs} media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) return media_connector.fetch_audio(audio_url) @@ -489,9 +499,7 @@ def fetch_image( image_url: URL of the image file to fetch. image_io_kwargs: Additional kwargs passed to handle image IO. """ - media_io_kwargs = None if not image_io_kwargs else { - "image": image_io_kwargs - } + media_io_kwargs = None if not image_io_kwargs else {"image": image_io_kwargs} media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) return media_connector.fetch_image(image_url) @@ -505,8 +513,6 @@ def fetch_video( video_url: URL of the video file to fetch. video_io_kwargs: Additional kwargs passed to handle video IO. """ - media_io_kwargs = None if not video_io_kwargs else { - "video": video_io_kwargs - } + media_io_kwargs = None if not video_io_kwargs else {"video": video_io_kwargs} media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) return media_connector.fetch_video(video_url) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 56097b852426..863bda9e529c 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -33,21 +33,47 @@ import uuid import warnings import weakref -from argparse import (Action, ArgumentDefaultsHelpFormatter, ArgumentParser, - ArgumentTypeError, RawDescriptionHelpFormatter, - _ArgumentGroup) +from argparse import ( + Action, + ArgumentDefaultsHelpFormatter, + ArgumentParser, + ArgumentTypeError, + RawDescriptionHelpFormatter, + _ArgumentGroup, +) from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task from collections import UserDict, defaultdict -from collections.abc import (AsyncGenerator, Awaitable, Collection, Generator, - Hashable, Iterable, Iterator, KeysView, Mapping, - Sequence) +from collections.abc import ( + AsyncGenerator, + Awaitable, + Collection, + Generator, + 
Hashable, + Iterable, + Iterator, + KeysView, + Mapping, + Sequence, +) from concurrent.futures import ThreadPoolExecutor from concurrent.futures.process import ProcessPoolExecutor from dataclasses import dataclass, field from functools import cache, lru_cache, partial, wraps from types import MappingProxyType -from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, - Optional, TextIO, TypeVar, Union, cast, overload) +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Generic, + Literal, + NamedTuple, + Optional, + TextIO, + TypeVar, + Union, + cast, + overload, +) from urllib.parse import urlparse from uuid import uuid4 @@ -116,8 +142,8 @@ """The number of bytes in one gibibyte (GiB).""" # ANSI color codes -CYAN = '\033[1;36m' -RESET = '\033[0;0m' +CYAN = "\033[1;36m" +RESET = "\033[0;0m" STR_DTYPE_TO_TORCH_DTYPE = { "float32": torch.float32, @@ -150,7 +176,7 @@ def set_default_torch_num_threads(num_threads: int): torch.set_num_threads(old_num_threads) -P = ParamSpec('P') +P = ParamSpec("P") T = TypeVar("T") U = TypeVar("U") @@ -159,8 +185,7 @@ def set_default_torch_num_threads(num_threads: int): _T = TypeVar("_T") -class _Sentinel: - ... +class _Sentinel: ... 
ALL_PINNED_SENTINEL = _Sentinel() @@ -177,7 +202,6 @@ class LayerBlockType(enum.Enum): class Counter: - def __init__(self, start: int = 0) -> None: self.counter = start @@ -191,7 +215,6 @@ def reset(self) -> None: class _MappingOrderCacheView(UserDict[_K, _V]): - def __init__(self, data: Mapping[_K, _V], ordered_keys: Mapping[_K, None]): super().__init__(data) self.ordered_keys = ordered_keys @@ -222,10 +245,9 @@ def __sub__(self, other: CacheInfo): class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]): - - def __init__(self, - capacity: float, - getsizeof: Optional[Callable[[_V], float]] = None): + def __init__( + self, capacity: float, getsizeof: Optional[Callable[[_V], float]] = None + ): super().__init__(capacity, getsizeof) self.pinned_items = set[_K]() @@ -245,8 +267,7 @@ def __getitem__(self, key: _K, *, update_info: bool = True) -> _V: def __delitem__(self, key: _K) -> None: run_on_remove = key in self - value = self.__getitem__(key, - update_info=False) # type: ignore[call-arg] + value = self.__getitem__(key, update_info=False) # type: ignore[call-arg] super().__delitem__(key) if key in self.pinned_items: # Todo: add warning to inform that del pinned item @@ -259,7 +280,8 @@ def cache(self) -> Mapping[_K, _V]: """Return the internal cache dictionary in order (read-only).""" return _MappingOrderCacheView( self._Cache__data, # type: ignore - self.order) + self.order, + ) @property def order(self) -> Mapping[_K, None]: @@ -300,22 +322,17 @@ def touch(self, key: _K) -> None: self._LRUCache__order[key] = None # type: ignore @overload - def get(self, key: _K, /) -> Optional[_V]: - ... + def get(self, key: _K, /) -> Optional[_V]: ... @overload - def get(self, key: _K, /, default: Union[_V, _T]) -> Union[_V, _T]: - ... - - def get(self, - key: _K, - /, - default: Optional[Union[_V, - _T]] = None) -> Optional[Union[_V, _T]]: + def get(self, key: _K, /, default: Union[_V, _T]) -> Union[_V, _T]: ... 
+ + def get( + self, key: _K, /, default: Optional[Union[_V, _T]] = None + ) -> Optional[Union[_V, _T]]: value: Optional[Union[_V, _T]] if key in self: - value = self.__getitem__( - key, update_info=False) # type: ignore[call-arg] + value = self.__getitem__(key, update_info=False) # type: ignore[call-arg] self._hits += 1 else: @@ -325,23 +342,19 @@ def get(self, return value @overload - def pop(self, key: _K) -> _V: - ... + def pop(self, key: _K) -> _V: ... @overload - def pop(self, key: _K, default: Union[_V, _T]) -> Union[_V, _T]: - ... + def pop(self, key: _K, default: Union[_V, _T]) -> Union[_V, _T]: ... - def pop(self, - key: _K, - default: Optional[Union[_V, - _T]] = None) -> Optional[Union[_V, _T]]: + def pop( + self, key: _K, default: Optional[Union[_V, _T]] = None + ) -> Optional[Union[_V, _T]]: value: Optional[Union[_V, _T]] if key not in self: return default - value = self.__getitem__(key, - update_info=False) # type: ignore[call-arg] + value = self.__getitem__(key, update_info=False) # type: ignore[call-arg] self.__delitem__(key) return value @@ -383,10 +396,12 @@ def popitem(self, remove_pinned: bool = False): # pop the oldest item in the cache that is not pinned lru_key = next( (key for key in self.order if key not in self.pinned_items), - ALL_PINNED_SENTINEL) + ALL_PINNED_SENTINEL, + ) if lru_key is ALL_PINNED_SENTINEL: - raise RuntimeError("All items are pinned, " - "cannot remove oldest from the cache.") + raise RuntimeError( + "All items are pinned, cannot remove oldest from the cache." + ) else: lru_key = next(iter(self.order)) value = self.pop(cast(_K, lru_key)) @@ -434,8 +449,7 @@ def get_object(self): return obj def reset(self): - """Makes all cached-objects available for the next scheduler iteration. 
- """ + """Makes all cached-objects available for the next scheduler iteration.""" self._index = 0 @@ -443,8 +457,8 @@ def reset(self): def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" from vllm import _custom_ops as ops - max_shared_mem = ( - ops.get_max_shared_memory_per_block_device_attribute(gpu)) + + max_shared_mem = ops.get_max_shared_memory_per_block_device_attribute(gpu) # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py # will fail assert max_shared_mem > 0, "max_shared_mem can not be zero" @@ -479,11 +493,14 @@ def __init__( self.batch_wait_timeout_s = batch_wait_timeout_s self._loop = asyncio.get_running_loop() - self._queues: dict[tuple, - asyncio.Queue[Union[tuple[str, dict, - asyncio.Future], - tuple[list[int], - asyncio.Future]]]] = {} + self._queues: dict[ + tuple, + asyncio.Queue[ + Union[ + tuple[str, dict, asyncio.Future], tuple[list[int], asyncio.Future] + ] + ], + ] = {} self._batcher_tasks: list[asyncio.Task] = [] # Single-thread executor for blocking tokenizer calls. @@ -507,8 +524,9 @@ async def decode(self, token_ids, **kwargs): # === Internal helpers === def _get_queue( self, loop: asyncio.AbstractEventLoop, key: tuple - ) -> asyncio.Queue[Union[tuple[str, dict, asyncio.Future], tuple[ - list[int], asyncio.Future]]]: + ) -> asyncio.Queue[ + Union[tuple[str, dict, asyncio.Future], tuple[list[int], asyncio.Future]] + ]: """Get the request queue for the given operation key, creating a new queue and batcher task if needed.""" queue = self._queues.get(key) @@ -518,8 +536,7 @@ def _get_queue( can_batch = key[1] != "other" coro = self._batch_encode_loop(queue, can_batch) else: - assert key[0] == "decode", \ - f"Unknown operation type: {key[0]}." + assert key[0] == "decode", f"Unknown operation type: {key[0]}." 
coro = self._batch_decode_loop(queue) self._batcher_tasks.append(loop.create_task(coro)) return queue @@ -539,7 +556,8 @@ async def _batch_encode_loop(self, queue: asyncio.Queue, can_batch: bool): break try: prompt, kwargs, result_future = await asyncio.wait_for( - queue.get(), timeout) + queue.get(), timeout + ) prompts.append(prompt) result_futures.append(result_future) if not can_batch: @@ -551,10 +569,10 @@ async def _batch_encode_loop(self, queue: asyncio.Queue, can_batch: bool): # If every request uses identical kwargs we can run a single # batched tokenizer call for a big speed-up. if can_batch and len(prompts) > 1: - batch_encode_fn = partial(self.tokenizer, prompts, - **kwargs) + batch_encode_fn = partial(self.tokenizer, prompts, **kwargs) results = await self._loop.run_in_executor( - self._executor, batch_encode_fn) + self._executor, batch_encode_fn + ) for i, fut in enumerate(result_futures): if not fut.done(): @@ -562,11 +580,11 @@ async def _batch_encode_loop(self, queue: asyncio.Queue, can_batch: bool): fut.set_result(BatchEncoding(data)) else: encode_fn = lambda prompts=prompts, kwargs=kwargs_list: [ - self.tokenizer(p, **kw) - for p, kw in zip(prompts, kwargs) + self.tokenizer(p, **kw) for p, kw in zip(prompts, kwargs) ] results = await self._loop.run_in_executor( - self._executor, encode_fn) + self._executor, encode_fn + ) for fut, res in zip(result_futures, results): if not fut.done(): @@ -590,7 +608,8 @@ async def _batch_decode_loop(self, queue: asyncio.Queue): break try: token_ids, result_future = await asyncio.wait_for( - queue.get(), timeout) + queue.get(), timeout + ) token_ids_list.append(token_ids) result_futures.append(result_future) except asyncio.TimeoutError: @@ -599,8 +618,8 @@ async def _batch_decode_loop(self, queue: asyncio.Queue): try: # Perform a single batched decode call for all requests results = await self._loop.run_in_executor( - self._executor, self.tokenizer.batch_decode, - token_ids_list) + self._executor, 
self.tokenizer.batch_decode, token_ids_list + ) for fut, res in zip(result_futures, results): if not fut.done(): fut.set_result(res) @@ -629,7 +648,7 @@ def _queue_key(self, op: str, kwargs: dict) -> tuple: """ if op == "decode": - return ("decode", ) + return ("decode",) add_special_tokens = kwargs.get("add_special_tokens", True) truncation = kwargs.get("truncation", False) @@ -639,16 +658,17 @@ def _queue_key(self, op: str, kwargs: dict) -> tuple: return "encode", add_special_tokens, False, None model_max = getattr(self.tokenizer, "model_max_length", None) - if max_length is None or (model_max is not None - and max_length == model_max): + if max_length is None or (model_max is not None and max_length == model_max): return "encode", add_special_tokens, True, "model_max" return "encode", "other" def __del__(self): - if ((tasks := getattr(self, "_batcher_tasks", None)) - and (loop := getattr(self, "_loop", None)) - and not loop.is_closed()): + if ( + (tasks := getattr(self, "_batcher_tasks", None)) + and (loop := getattr(self, "_loop", None)) + and not loop.is_closed() + ): def cancel_tasks(): for task in tasks: @@ -683,8 +703,7 @@ def in_loop(event_loop: AbstractEventLoop) -> bool: def make_async( - func: Callable[P, T], - executor: Optional[concurrent.futures.Executor] = None + func: Callable[P, T], executor: Optional[concurrent.futures.Executor] = None ) -> Callable[P, Awaitable[T]]: """Take a blocking function, and run it on in an executor thread. 
@@ -701,15 +720,14 @@ def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future: return _async_wrapper -def _next_task(iterator: AsyncGenerator[T, None], - loop: AbstractEventLoop) -> Task: +def _next_task(iterator: AsyncGenerator[T, None], loop: AbstractEventLoop) -> Task: # Can use anext() in python >= 3.10 return loop.create_task(iterator.__anext__()) # type: ignore[arg-type] async def merge_async_iterators( - *iterators: AsyncGenerator[T, - None], ) -> AsyncGenerator[tuple[int, T], None]: + *iterators: AsyncGenerator[T, None], +) -> AsyncGenerator[tuple[int, T], None]: """Merge multiple asynchronous iterators into a single iterator. This method handle the case where some iterators finish before others. @@ -727,8 +745,7 @@ async def merge_async_iterators( awaits = {_next_task(pair[1], loop): pair for pair in enumerate(iterators)} try: while awaits: - done, _ = await asyncio.wait(awaits.keys(), - return_when=FIRST_COMPLETED) + done, _ = await asyncio.wait(awaits.keys(), return_when=FIRST_COMPLETED) for d in done: pair = awaits.pop(d) try: @@ -746,8 +763,7 @@ async def merge_async_iterators( await it.aclose() -async def collect_from_async_generator( - iterator: AsyncGenerator[T, None]) -> list[T]: +async def collect_from_async_generator(iterator: AsyncGenerator[T, None]) -> list[T]: """Collect all items from an async generator into a list.""" items = [] async for item in iterator: @@ -763,7 +779,8 @@ def get_ip() -> str: " it is often used by Docker and other software to" " interact with the container's network stack. Please " "use VLLM_HOST_IP instead to set the IP address for vLLM processes" - " to communicate with each other.") + " to communicate with each other." + ) if host_ip: return host_ip @@ -791,7 +808,8 @@ def get_ip() -> str: "Failed to get the IP address, using 0.0.0.0 by default." 
"The value can be set by the environment variable" " VLLM_HOST_IP or HOST_IP.", - stacklevel=2) + stacklevel=2, + ) return "0.0.0.0" @@ -819,7 +837,8 @@ def get_loopback_ip() -> str: else: raise RuntimeError( "Neither 127.0.0.1 nor ::1 are bound to a local interface. " - "Set the VLLM_LOOPBACK_IP environment variable explicitly.") + "Set the VLLM_LOOPBACK_IP environment variable explicitly." + ) def is_valid_ipv6_address(address: str) -> bool: @@ -832,13 +851,13 @@ def is_valid_ipv6_address(address: str) -> bool: def split_host_port(host_port: str) -> tuple[str, int]: # ipv6 - if host_port.startswith('['): - host, port = host_port.rsplit(']', 1) + if host_port.startswith("["): + host, port = host_port.rsplit("]", 1) host = host[1:] - port = port.split(':')[1] + port = port.split(":")[1] return host, int(port) else: - host, port = host_port.split(':') + host, port = host_port.split(":") return host, int(port) @@ -906,8 +925,7 @@ def _get_open_port() -> int: return port except OSError: port += 1 # Increment port number if already in use - logger.info("Port %d is already in use, trying port %d", - port - 1, port) + logger.info("Port %d is already in use, trying port %d", port - 1, port) # try ipv4 try: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: @@ -930,8 +948,7 @@ def find_process_using_port(port: int) -> Optional[psutil.Process]: our_pid = os.getpid() for conn in psutil.net_connections(): - if conn.laddr.port == port and (conn.pid is not None - and conn.pid != our_pid): + if conn.laddr.port == port and (conn.pid is not None and conn.pid != our_pid): try: return psutil.Process(conn.pid) except psutil.NoSuchProcess: @@ -943,15 +960,18 @@ def update_environment_variables(envs: dict[str, str]): for k, v in envs.items(): if k in os.environ and os.environ[k] != v: logger.warning( - "Overwriting environment variable %s " - "from '%s' to '%s'", k, os.environ[k], v) + "Overwriting environment variable %s from '%s' to '%s'", + k, + os.environ[k], + v, + ) 
os.environ[k] = v def chunk_list(lst: list[T], chunk_size: int): """Yield successive chunk_size chunks from lst.""" for i in range(0, len(lst), chunk_size): - yield lst[i:i + chunk_size] + yield lst[i : i + chunk_size] def cdiv(a: int, b: int) -> int: @@ -995,6 +1015,7 @@ def _generate_random_fp8( # Inf | N/A | s.11111.00 # NaN | s.1111.111 | s.11111.{01,10,11} from vllm import _custom_ops as ops + tensor_tmp = torch.empty_like(tensor, dtype=torch.float16) tensor_tmp.uniform_(low, high) ops.convert_fp8(tensor, tensor_tmp) @@ -1002,12 +1023,12 @@ def _generate_random_fp8( def get_kv_cache_torch_dtype( - cache_dtype: Optional[Union[str, torch.dtype]], - model_dtype: Optional[Union[str, torch.dtype]] = None) -> torch.dtype: + cache_dtype: Optional[Union[str, torch.dtype]], + model_dtype: Optional[Union[str, torch.dtype]] = None, +) -> torch.dtype: if isinstance(cache_dtype, str): if cache_dtype == "auto": - if isinstance(model_dtype, - str) and model_dtype in STR_DTYPE_TO_TORCH_DTYPE: + if isinstance(model_dtype, str) and model_dtype in STR_DTYPE_TO_TORCH_DTYPE: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype] elif isinstance(model_dtype, torch.dtype): torch_dtype = model_dtype @@ -1037,32 +1058,30 @@ def create_kv_caches_with_random_flash( cache_layout: Optional[str] = "NHD", ) -> tuple[list[torch.Tensor], list[torch.Tensor]]: from vllm.platforms import current_platform + current_platform.seed_everything(seed) torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) generic_kv_cache_shape = (num_blocks, 2, block_size, num_heads, head_size) assert cache_layout in ("NHD", "HND") - stride_order = (0, 1, 2, 3, 4) if cache_layout == "NHD" else (0, 1, 3, 2, - 4) + stride_order = (0, 1, 2, 3, 4) if cache_layout == "NHD" else (0, 1, 3, 2, 4) - kv_cache_allocation_shape = tuple(generic_kv_cache_shape[i] - for i in stride_order) + kv_cache_allocation_shape = tuple(generic_kv_cache_shape[i] for i in stride_order) scale = head_size**-0.5 key_caches: list[torch.Tensor] 
= [] value_caches: list[torch.Tensor] = [] for _ in range(num_layers): - key_value_cache = torch.empty(size=kv_cache_allocation_shape, - dtype=torch_dtype, - device=device).permute(*stride_order) + key_value_cache = torch.empty( + size=kv_cache_allocation_shape, dtype=torch_dtype, device=device + ).permute(*stride_order) if cache_dtype in ["auto", "half", "bfloat16", "float"]: key_value_cache.uniform_(-scale, scale) - elif cache_dtype == 'fp8': + elif cache_dtype == "fp8": _generate_random_fp8(key_value_cache, -scale, scale) else: - raise ValueError( - f"Does not support key cache of type {cache_dtype}") + raise ValueError(f"Does not support key cache of type {cache_dtype}") key_caches.append(key_value_cache[:, 0]) value_caches.append(key_value_cache[:, 1]) return key_caches, value_caches @@ -1084,6 +1103,7 @@ def create_kv_caches_with_random( f"Does not support key cache of type fp8 with head_size {head_size}" ) from vllm.platforms import current_platform + current_platform.seed_everything(seed) torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) @@ -1093,31 +1113,27 @@ def create_kv_caches_with_random( key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) key_caches: list[torch.Tensor] = [] for _ in range(num_layers): - key_cache = torch.empty(size=key_cache_shape, - dtype=torch_dtype, - device=device) + key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, device=device) if cache_dtype in ["auto", "half", "bfloat16", "float"]: key_cache.uniform_(-scale, scale) - elif cache_dtype == 'fp8': + elif cache_dtype == "fp8": _generate_random_fp8(key_cache, -scale, scale) else: - raise ValueError( - f"Does not support key cache of type {cache_dtype}") + raise ValueError(f"Does not support key cache of type {cache_dtype}") key_caches.append(key_cache) value_cache_shape = (num_blocks, num_heads, head_size, block_size) value_caches: list[torch.Tensor] = [] for _ in range(num_layers): - value_cache = 
torch.empty(size=value_cache_shape, - dtype=torch_dtype, - device=device) + value_cache = torch.empty( + size=value_cache_shape, dtype=torch_dtype, device=device + ) if cache_dtype in ["auto", "half", "bfloat16", "float"]: value_cache.uniform_(-scale, scale) - elif cache_dtype == 'fp8': + elif cache_dtype == "fp8": _generate_random_fp8(value_cache, -scale, scale) else: - raise ValueError( - f"Does not support value cache of type {cache_dtype}") + raise ValueError(f"Does not support value cache of type {cache_dtype}") value_caches.append(value_cache) return key_caches, value_caches @@ -1125,6 +1141,7 @@ def create_kv_caches_with_random( @cache def is_pin_memory_available() -> bool: from vllm.platforms import current_platform + return current_platform.is_pin_memory_available() @@ -1137,13 +1154,13 @@ def is_uva_available() -> bool: class DeviceMemoryProfiler: - def __init__(self, device: Optional[torch.types.Device] = None): self.device = device def current_memory_usage(self) -> float: # Return the memory usage in bytes. 
from vllm.platforms import current_platform + gc.collect() return current_platform.get_current_memory_usage(self.device) @@ -1180,7 +1197,7 @@ def make_ndarray_with_pad( padded_x = np.full((len(x), max_len), pad, dtype=dtype) for ind, blocktb in enumerate(x): assert len(blocktb) <= max_len - padded_x[ind, :len(blocktb)] = blocktb + padded_x[ind, : len(blocktb)] = blocktb return padded_x @@ -1229,8 +1246,7 @@ def get_dtype_size(dtype: torch.dtype) -> int: # bool = 0, int = 1, float = 2, complex = 3 def _get_precision_level(dtype: torch.dtype) -> int: # NOTE: Complex dtypes return `is_floating_point=False` - return ((dtype != torch.bool) + dtype.is_floating_point + - dtype.is_complex * 2) + return (dtype != torch.bool) + dtype.is_floating_point + dtype.is_complex * 2 def is_lossless_cast(src_dtype: torch.dtype, tgt_dtype: torch.dtype): @@ -1258,8 +1274,11 @@ def is_lossless_cast(src_dtype: torch.dtype, tgt_dtype: torch.dtype): # Compare floating-point types src_info = torch.finfo(src_dtype) tgt_info = torch.finfo(tgt_dtype) - return (src_info.min >= tgt_info.min and src_info.max <= tgt_info.max - and src_info.resolution >= tgt_info.resolution) + return ( + src_info.min >= tgt_info.min + and src_info.max <= tgt_info.max + and src_info.resolution >= tgt_info.resolution + ) def common_broadcastable_dtype(dtypes: Collection[torch.dtype]): @@ -1327,6 +1346,7 @@ def init_cached_hf_modules() -> None: Lazy initialization of the Hugging Face modules. 
""" from transformers.dynamic_module_utils import init_hf_modules + init_hf_modules() @@ -1370,8 +1390,8 @@ def find_nccl_library() -> str: # manually load the nccl library if so_file: logger.info( - "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", - so_file) + "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", so_file + ) else: if torch.version.cuda is not None: so_file = "libnccl.so.2" @@ -1386,8 +1406,8 @@ def find_nccl_library() -> str: def find_nccl_include_paths() -> Optional[list[str]]: """ We either use the nccl.h specified by the `VLLM_NCCL_INCLUDE_PATH` - environment variable, or we find the library file brought by - nvidia-nccl-cuXX. load_inline by default uses + environment variable, or we find the library file brought by + nvidia-nccl-cuXX. load_inline by default uses torch.utils.cpp_extension.include_paths """ paths: list[str] = [] @@ -1397,6 +1417,7 @@ def find_nccl_include_paths() -> Optional[list[str]]: try: import importlib.util + spec = importlib.util.find_spec("nvidia.nccl") if spec and getattr(spec, "submodule_search_locations", None): for loc in spec.submodule_search_locations: @@ -1429,7 +1450,6 @@ def _patched_set_stream(stream: torch.cuda.Stream) -> None: class _StreamPlaceholder: - def __init__(self): self.synchronize = lambda: None @@ -1446,8 +1466,8 @@ def current_stream() -> torch.cuda.Stream: from C/C++ code. """ from vllm.platforms import current_platform - if not hasattr(_current_stream_tls, - "value") or _current_stream_tls.value is None: + + if not hasattr(_current_stream_tls, "value") or _current_stream_tls.value is None: # when this function is called before any stream is set, # we return the default stream. 
# On ROCm using the default 0 stream in combination with RCCL @@ -1465,7 +1485,8 @@ def current_stream() -> torch.cuda.Stream: else: raise ValueError( "Fail to set current stream, current platform " - "may not support current_stream with torch API") + "may not support current_stream with torch API" + ) return _current_stream_tls.value @@ -1478,12 +1499,14 @@ def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None: tmp_dir = tempfile.gettempdir() # add username to tmp_dir to avoid permission issues tmp_dir = os.path.join(tmp_dir, getpass.getuser()) - filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}" - f"_thread_{threading.get_ident()}_" - f"at_{datetime.datetime.now()}.log").replace(" ", "_") - log_path = os.path.join(tmp_dir, "vllm", - f"vllm-instance-{vllm_config.instance_id}", - filename) + filename = ( + f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}" + f"_thread_{threading.get_ident()}_" + f"at_{datetime.datetime.now()}.log" + ).replace(" ", "_") + log_path = os.path.join( + tmp_dir, "vllm", f"vllm-instance-{vllm_config.instance_id}", filename + ) os.makedirs(os.path.dirname(log_path), exist_ok=True) enable_trace_function_call(log_path) @@ -1494,7 +1517,7 @@ def identity(value: T, **kwargs) -> T: return value -F = TypeVar('F', bound=Callable[..., Any]) +F = TypeVar("F", bound=Callable[..., Any]) def deprecate_args( @@ -1506,24 +1529,22 @@ def deprecate_args( is_deprecated = partial(identity, is_deprecated) def wrapper(fn: F) -> F: - params = inspect.signature(fn).parameters pos_types = ( inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD, ) - pos_kws = [ - kw for kw, param in params.items() if param.kind in pos_types - ] + pos_kws = [kw for kw, param in params.items() if param.kind in pos_types] @wraps(fn) def inner(*args, **kwargs): if is_deprecated(): - deprecated_args = pos_kws[start_index:len(args)] + deprecated_args = pos_kws[start_index : len(args)] if deprecated_args: msg = ( f"The positional 
arguments {deprecated_args} are " - "deprecated and will be removed in a future update.") + "deprecated and will be removed in a future update." + ) if additional_message is not None: msg += f" {additional_message}" @@ -1550,7 +1571,6 @@ def deprecate_kwargs( is_deprecated = partial(identity, is_deprecated) def wrapper(fn: F) -> F: - @wraps(fn) def inner(*args, **kwargs): if is_deprecated(): @@ -1558,7 +1578,8 @@ def inner(*args, **kwargs): if deprecated_kwargs: msg = ( f"The keyword arguments {deprecated_kwargs} are " - "deprecated and will be removed in a future update.") + "deprecated and will be removed in a future update." + ) if additional_message is not None: msg += f" {additional_message}" @@ -1575,8 +1596,7 @@ def inner(*args, **kwargs): @lru_cache(maxsize=8) -def _cuda_device_count_stateless( - cuda_visible_devices: Optional[str] = None) -> int: +def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) -> int: # Note: cuda_visible_devices is not used, but we keep it as an argument for # LRU Cache purposes. 
@@ -1588,13 +1608,17 @@ def _cuda_device_count_stateless( import torch.version from vllm.platforms import current_platform + if not torch.cuda._is_compiled(): return 0 if current_platform.is_rocm(): # ROCm uses amdsmi instead of nvml for stateless device count # This requires a sufficiently modern version of Torch 2.4.0 - raw_count = torch.cuda._device_count_amdsmi() if (hasattr( - torch.cuda, "_device_count_amdsmi")) else -1 + raw_count = ( + torch.cuda._device_count_amdsmi() + if (hasattr(torch.cuda, "_device_count_amdsmi")) + else -1 + ) else: raw_count = torch.cuda._device_count_nvml() r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count @@ -1628,9 +1652,9 @@ def xpu_is_initialized() -> bool: return torch.xpu.is_initialized() -def cuda_get_device_properties(device, - names: Sequence[str], - init_cuda=False) -> tuple[Any, ...]: +def cuda_get_device_properties( + device, names: Sequence[str], init_cuda=False +) -> tuple[Any, ...]: """Get specified CUDA device property values without initializing CUDA in the current process.""" if init_cuda or cuda_is_initialized(): @@ -1640,11 +1664,12 @@ def cuda_get_device_properties(device, # Run in subprocess to avoid initializing CUDA as a side effect. 
mp_ctx = multiprocessing.get_context("fork") with ProcessPoolExecutor(max_workers=1, mp_context=mp_ctx) as executor: - return executor.submit(cuda_get_device_properties, device, names, - True).result() + return executor.submit(cuda_get_device_properties, device, names, True).result() -def weak_bind(bound_method: Callable[..., Any], ) -> Callable[..., None]: +def weak_bind( + bound_method: Callable[..., Any], +) -> Callable[..., None]: """Make an instance method that weakly references its associated instance and no-ops once that instance is collected.""" @@ -1659,7 +1684,6 @@ def weak_bound(*args, **kwargs) -> None: def run_once(f: Callable[P, None]) -> Callable[P, None]: - def wrapper(*args: P.args, **kwargs: P.kwargs) -> None: if wrapper.has_run: # type: ignore[attr-defined] return @@ -1675,19 +1699,18 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> None: class StoreBoolean(Action): - def __call__(self, parser, namespace, values, option_string=None): if values.lower() == "true": setattr(namespace, self.dest, True) elif values.lower() == "false": setattr(namespace, self.dest, False) else: - raise ValueError(f"Invalid boolean value: {values}. " - "Expected 'true' or 'false'.") + raise ValueError( + f"Invalid boolean value: {values}. Expected 'true' or 'false'." + ) -class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, - RawDescriptionHelpFormatter): +class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter): """SortedHelpFormatter that sorts arguments by their option strings.""" def _split_lines(self, text, width): @@ -1699,7 +1722,7 @@ def _split_lines(self, text, width): # The patterns also include whitespace after the newline single_newline = re.compile(r"(? 
str: # Add tip about JSON arguments to the epilog epilog = self.epilog or "" - if (self.add_json_tip - and not epilog.startswith(FlexibleArgumentParser._json_tip)): + if self.add_json_tip and not epilog.startswith( + FlexibleArgumentParser._json_tip + ): self.epilog = FlexibleArgumentParser._json_tip + epilog return super().format_help() @@ -1786,15 +1813,16 @@ def parse_args( # type: ignore[override] # Check for --model in command line arguments first if args and args[0] == "serve": - model_in_cli_args = any(arg == '--model' for arg in args) + model_in_cli_args = any(arg == "--model" for arg in args) if model_in_cli_args: raise ValueError( "With `vllm serve`, you should provide the model as a " "positional argument or in a config file instead of via " - "the `--model` option.") + "the `--model` option." + ) - if '--config' in args: + if "--config" in args: args = self._pull_args_from_config(args) def repl(match: re.Match) -> str: @@ -1807,25 +1835,27 @@ def repl(match: re.Match) -> str: # Convert underscores to dashes and vice versa in argument names processed_args = list[str]() for i, arg in enumerate(args): - if arg.startswith('--'): - if '=' in arg: - key, value = arg.split('=', 1) + if arg.startswith("--"): + if "=" in arg: + key, value = arg.split("=", 1) key = pattern.sub(repl, key, count=1) - processed_args.append(f'{key}={value}') + processed_args.append(f"{key}={value}") else: key = pattern.sub(repl, arg, count=1) processed_args.append(key) - elif arg.startswith('-O') and arg != '-O' and arg[2] != '.': + elif arg.startswith("-O") and arg != "-O" and arg[2] != ".": # allow -O flag to be used without space, e.g. 
-O3 or -Odecode # -O.<...> handled later # also handle -O= here - level = arg[3:] if arg[2] == '=' else arg[2:] - processed_args.append(f'-O.level={level}') - elif arg == '-O' and i + 1 < len(args) and args[i + 1] in { - "0", "1", "2", "3" - }: + level = arg[3:] if arg[2] == "=" else arg[2:] + processed_args.append(f"-O.level={level}") + elif ( + arg == "-O" + and i + 1 < len(args) + and args[i + 1] in {"0", "1", "2", "3"} + ): # Convert -O to -O.level - processed_args.append('-O.level') + processed_args.append("-O.level") else: processed_args.append(arg) @@ -1889,14 +1919,11 @@ def recursive_dict_update( # Merge all values with the same key into a single dict arg_dict = create_nested_dict(keys, value) - arg_duplicates = recursive_dict_update(dict_args[key], - arg_dict) - duplicates |= {f'{key}.{d}' for d in arg_duplicates} + arg_duplicates = recursive_dict_update(dict_args[key], arg_dict) + duplicates |= {f"{key}.{d}" for d in arg_duplicates} delete.add(i) # Filter out the dict args we set to None - processed_args = [ - a for i, a in enumerate(processed_args) if i not in delete - ] + processed_args = [a for i, a in enumerate(processed_args) if i not in delete] if duplicates: logger.warning("Found duplicate keys %s", ", ".join(duplicates)) @@ -1953,13 +1980,14 @@ def _pull_args_from_config(self, args: list[str]) -> list[str]: this way the order of priorities is maintained when these are args parsed by super(). """ - assert args.count( - '--config') <= 1, "More than one config file specified!" + assert args.count("--config") <= 1, "More than one config file specified!" - index = args.index('--config') + index = args.index("--config") if index == len(args) - 1: - raise ValueError("No config file specified! \ - Please check your command-line arguments.") + raise ValueError( + "No config file specified! \ + Please check your command-line arguments." 
+ ) file_path = args[index + 1] @@ -1971,29 +1999,33 @@ def _pull_args_from_config(self, args: list[str]) -> list[str]: # followed by rest of cli args. # maintaining this order will enforce the precedence # of cli > config > defaults - if args[0].startswith('-'): + if args[0].startswith("-"): # No sub command (e.g., api_server entry point) - args = config_args + args[0:index] + args[index + 2:] + args = config_args + args[0:index] + args[index + 2 :] elif args[0] == "serve": - model_in_cli = len(args) > 1 and not args[1].startswith('-') - model_in_config = any(arg == '--model' for arg in config_args) + model_in_cli = len(args) > 1 and not args[1].startswith("-") + model_in_config = any(arg == "--model" for arg in config_args) if not model_in_cli and not model_in_config: raise ValueError( "No model specified! Please specify model either " - "as a positional argument or in a config file.") + "as a positional argument or in a config file." + ) if model_in_cli: # Model specified as positional arg, keep CLI version - args = [args[0]] + [ - args[1] - ] + config_args + args[2:index] + args[index + 2:] + args = ( + [args[0]] + + [args[1]] + + config_args + + args[2:index] + + args[index + 2 :] + ) else: # No model in CLI, use config if available - args = [args[0] - ] + config_args + args[1:index] + args[index + 2:] + args = [args[0]] + config_args + args[1:index] + args[index + 2 :] else: - args = [args[0]] + config_args + args[1:index] + args[index + 2:] + args = [args[0]] + config_args + args[1:index] + args[index + 2 :] return args @@ -2010,11 +2042,13 @@ def load_config_file(self, file_path: str) -> list[str]: '--tensor-parallel-size': '4' ] """ - extension: str = file_path.split('.')[-1] - if extension not in ('yaml', 'yml'): + extension: str = file_path.split(".")[-1] + if extension not in ("yaml", "yml"): raise ValueError( "Config file must be of a yaml/yml type.\ - %s supplied", extension) + %s supplied", + extension, + ) # only expecting a flat dictionary of 
atomic types processed_args: list[str] = [] @@ -2026,32 +2060,32 @@ def load_config_file(self, file_path: str) -> list[str]: except Exception as ex: logger.error( "Unable to read the config file at %s. \ - Make sure path is correct", file_path) + Make sure path is correct", + file_path, + ) raise ex store_boolean_arguments = [ - action.dest for action in self._actions - if isinstance(action, StoreBoolean) + action.dest for action in self._actions if isinstance(action, StoreBoolean) ] for key, value in config.items(): if isinstance(value, bool) and key not in store_boolean_arguments: if value: - processed_args.append('--' + key) + processed_args.append("--" + key) elif isinstance(value, list): if value: - processed_args.append('--' + key) + processed_args.append("--" + key) for item in value: processed_args.append(str(item)) else: - processed_args.append('--' + key) + processed_args.append("--" + key) processed_args.append(str(value)) return processed_args -async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, - **kwargs): +async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, **kwargs): """Utility function to run async task in a lock""" async with lock: return await task(*args, **kwargs) @@ -2075,19 +2109,26 @@ def supports_kw( param_val = params.get(kw_name) # Types where the it may be valid, i.e., explicitly defined & nonvariadic - passable_kw_types = set((inspect.Parameter.POSITIONAL_ONLY, - inspect.Parameter.POSITIONAL_OR_KEYWORD, - inspect.Parameter.KEYWORD_ONLY)) + passable_kw_types = set( + ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + ) + ) if param_val: is_sig_param = param_val.kind in passable_kw_types # We want kwargs only, but this is passable as a positional arg - if (requires_kw_only and is_sig_param - and param_val.kind != inspect.Parameter.KEYWORD_ONLY): + if ( + requires_kw_only + and is_sig_param + and param_val.kind != 
inspect.Parameter.KEYWORD_ONLY + ): return False - if ((requires_kw_only - and param_val.kind == inspect.Parameter.KEYWORD_ONLY) - or (not requires_kw_only and is_sig_param)): + if (requires_kw_only and param_val.kind == inspect.Parameter.KEYWORD_ONLY) or ( + not requires_kw_only and is_sig_param + ): return True # If we're okay with var-kwargs, it's supported as long as @@ -2097,8 +2138,10 @@ def supports_kw( # mapping, but it wraps an ordered dict, and they appear in order. # Ref: https://docs.python.org/3/library/inspect.html#inspect.Signature.parameters last_param = params[next(reversed(params))] # type: ignore - return (last_param.kind == inspect.Parameter.VAR_KEYWORD - and last_param.name != kw_name) + return ( + last_param.kind == inspect.Parameter.VAR_KEYWORD + and last_param.name != kw_name + ) return False @@ -2137,10 +2180,12 @@ def get_allowed_kwarg_only_overrides( filtered_overrides = { kwarg_name: val for kwarg_name, val in overrides.items() - if supports_kw(callable, - kwarg_name, - requires_kw_only=requires_kw_only, - allow_var_kwargs=allow_var_kwargs) + if supports_kw( + callable, + kwarg_name, + requires_kw_only=requires_kw_only, + allow_var_kwargs=allow_var_kwargs, + ) } # If anything is dropped, log a warning @@ -2149,11 +2194,15 @@ def get_allowed_kwarg_only_overrides( if requires_kw_only: logger.warning( "The following intended overrides are not keyword-only args " - "and will be dropped: %s", dropped_keys) + "and will be dropped: %s", + dropped_keys, + ) else: logger.warning( "The following intended overrides are not keyword args " - "and will be dropped: %s", dropped_keys) + "and will be dropped: %s", + dropped_keys, + ) return filtered_overrides @@ -2168,8 +2217,9 @@ def supports_dynamo() -> bool: # Supports xccl with PyTorch versions >= 2.8.0.dev for XPU platform def supports_xccl() -> bool: - return is_torch_equal_or_newer( - "2.8.0.dev") and torch.distributed.is_xccl_available() + return ( + is_torch_equal_or_newer("2.8.0.dev") and 
torch.distributed.is_xccl_available() + ) # Some backends use pytorch version < 2.4.0 which doesn't @@ -2205,7 +2255,6 @@ def value(self): # Adapted from: https://stackoverflow.com/a/47212782/5082708 class LazyDict(Mapping[str, T], Generic[T]): - def __init__(self, factory: dict[str, Callable[[], T]]): self._factory = factory self._dict: dict[str, T] = {} @@ -2228,7 +2277,6 @@ def __len__(self): class ClassRegistry(UserDict[type[T], _V]): - def __getitem__(self, key: type[T]) -> _V: for cls in key.mro(): if cls in self.data: @@ -2262,8 +2310,9 @@ def weak_ref_tensor(tensor: Any) -> Any: def weak_ref_tensors( - tensors: Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor], - IntermediateTensors] + tensors: Union[ + torch.Tensor, list[torch.Tensor], tuple[torch.Tensor], IntermediateTensors + ], ) -> Union[torch.Tensor, list[Any], tuple[Any], Any]: """ Convenience function to create weak references to tensors, @@ -2278,11 +2327,11 @@ def weak_ref_tensors( # For IntermediateTensors used in pipeline parallelism from vllm.sequence import IntermediateTensors + if isinstance(tensors, IntermediateTensors): - ret = IntermediateTensors({ - key: weak_ref_tensor(val) - for key, val in tensors.tensors.items() - }) + ret = IntermediateTensors( + {key: weak_ref_tensor(val) for key, val in tensors.tensors.items()} + ) return ret raise ValueError("Invalid type for tensors") @@ -2322,7 +2371,8 @@ def get_vllm_optional_dependencies(): return { extra: [ - re.split(r";|>=|<=|==", req)[0] for req in requirements + re.split(r";|>=|<=|==", req)[0] + for req in requirements if req.endswith(f'extra == "{extra}"') ] for extra in extras @@ -2515,12 +2565,13 @@ def __getattr__(self, key: str): raise exc - raise AssertionError("PlaceholderModule should not be used " - "when the original module can be imported") + raise AssertionError( + "PlaceholderModule should not be used " + "when the original module can be imported" + ) class _PlaceholderModuleAttr(_PlaceholderBase): - def 
__init__(self, module: PlaceholderModule, attr_path: str) -> None: super().__init__() @@ -2529,14 +2580,15 @@ def __init__(self, module: PlaceholderModule, attr_path: str) -> None: self.__attr_path = attr_path def placeholder_attr(self, attr_path: str): - return _PlaceholderModuleAttr(self.__module, - f"{self.__attr_path}.{attr_path}") + return _PlaceholderModuleAttr(self.__module, f"{self.__attr_path}.{attr_path}") def __getattr__(self, key: str): getattr(self.__module, f"{self.__attr_path}.{key}") - raise AssertionError("PlaceholderModule should not be used " - "when the original module can be imported") + raise AssertionError( + "PlaceholderModule should not be used " + "when the original module can be imported" + ) # create a library to hold the custom op @@ -2544,13 +2596,13 @@ def __getattr__(self, key: str): def direct_register_custom_op( - op_name: str, - op_func: Callable, - mutates_args: Optional[list[str]] = None, - fake_impl: Optional[Callable] = None, - target_lib: Optional[Library] = None, - dispatch_key: Optional[str] = None, - tags: tuple[torch.Tag, ...] = (), + op_name: str, + op_func: Callable, + mutates_args: Optional[list[str]] = None, + fake_impl: Optional[Callable] = None, + target_lib: Optional[Library] = None, + dispatch_key: Optional[str] = None, + tags: tuple[torch.Tag, ...] = (), ): """ `torch.library.custom_op` can have significant overhead because it @@ -2569,12 +2621,14 @@ def direct_register_custom_op( """ if not supports_custom_op(): from vllm.platforms import current_platform + assert not current_platform.is_cuda_alike(), ( "cuda platform needs torch>=2.4 to support custom op, " "chances are you are using an old version of pytorch " "or a custom build of pytorch. It is recommended to " "use vLLM in a fresh new environment and let it install " - "the required dependencies.") + "the required dependencies." 
+ ) return if mutates_args is None: @@ -2582,15 +2636,17 @@ def direct_register_custom_op( if dispatch_key is None: from vllm.platforms import current_platform + dispatch_key = current_platform.dispatch_key import torch.library + if hasattr(torch.library, "infer_schema"): - schema_str = torch.library.infer_schema(op_func, - mutates_args=mutates_args) + schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args) else: # for pytorch 2.4 import torch._custom_op.impl + schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args) my_lib = target_lib or vllm_lib my_lib.define(op_name + schema_str, tags=tags) @@ -2636,6 +2692,7 @@ def kill_process_tree(pid: int): @dataclass class MemorySnapshot: """Memory snapshot.""" + torch_peak: int = 0 free_memory: int = 0 total_memory: int = 0 @@ -2660,7 +2717,8 @@ def measure(self) -> None: # `torch.cuda.memory_reserved()` will keep growing, and only shrink # when we call `torch.cuda.empty_cache()` or OOM happens. self.torch_peak = torch.cuda.memory_stats(device).get( - "allocated_bytes.all.peak", 0) + "allocated_bytes.all.peak", 0 + ) self.free_memory, self.total_memory = torch.cuda.mem_get_info(device) self.cuda_memory = self.total_memory - self.free_memory @@ -2677,7 +2735,8 @@ def __sub__(self, other: MemorySnapshot) -> MemorySnapshot: if self.device != other.device: raise ValueError( "The two snapshots should be from the same device! " - f"Found: {self.device} vs. {other.device}") + f"Found: {self.device} vs. {other.device}" + ) return MemorySnapshot( torch_peak=self.torch_peak - other.torch_peak, @@ -2694,8 +2753,8 @@ def __sub__(self, other: MemorySnapshot) -> MemorySnapshot: @dataclass class MemoryProfilingResult: - """Memory profiling result. All numbers are in bytes. - """ + """Memory profiling result. 
All numbers are in bytes.""" + non_kv_cache_memory: int = 0 torch_peak_increase: int = 0 non_torch_increase: int = 0 @@ -2710,14 +2769,16 @@ def __post_init__(self) -> None: self.after_profile = MemorySnapshot(device=device, auto_measure=False) def __repr__(self) -> str: - return (f"Memory profiling takes {self.profile_time:.2f} seconds. " - f"Total non KV cache memory: " - f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; " - f"torch peak memory increase: " - f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; " - f"non-torch forward increase memory: " - f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; " - f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB.") + return ( + f"Memory profiling takes {self.profile_time:.2f} seconds. " + f"Total non KV cache memory: " + f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; " + f"torch peak memory increase: " + f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; " + f"non-torch forward increase memory: " + f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; " + f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB." 
+ ) @contextlib.contextmanager @@ -2798,29 +2859,34 @@ def memory_profiling( non_torch_memory = result.non_torch_increase peak_activation_memory = result.torch_peak_increase - result.non_kv_cache_memory = non_torch_memory + peak_activation_memory + result.weights_memory # noqa + result.non_kv_cache_memory = ( + non_torch_memory + peak_activation_memory + result.weights_memory + ) # noqa # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 def set_ulimit(target_soft_limit=65535): - if sys.platform.startswith('win'): + if sys.platform.startswith("win"): logger.info("Windows detected, skipping ulimit adjustment.") return import resource + resource_type = resource.RLIMIT_NOFILE current_soft, current_hard = resource.getrlimit(resource_type) if current_soft < target_soft_limit: try: - resource.setrlimit(resource_type, - (target_soft_limit, current_hard)) + resource.setrlimit(resource_type, (target_soft_limit, current_hard)) except ValueError as e: logger.warning( "Found ulimit of %s and failed to automatically increase " "with error %s. This can cause fd limit errors like " "`OSError: [Errno 24] Too many open files`. Consider " - "increasing with ulimit -n", current_soft, e) + "increasing with ulimit -n", + current_soft, + e, + ) # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/utils.py#L28 # noqa: E501 @@ -2941,11 +3007,7 @@ def zmq_socket_ctx( ctx = zmq.Context() # type: ignore[attr-defined] try: - yield make_zmq_socket(ctx, - path, - socket_type, - bind=bind, - identity=identity) + yield make_zmq_socket(ctx, path, socket_type, bind=bind, identity=identity) except KeyboardInterrupt: logger.debug("Got Keyboard Interrupt.") @@ -2966,6 +3028,7 @@ def _maybe_force_spawn(): # to the subprocess so that it knows how to connect to the ray cluster. # env vars are inherited by subprocesses, even if we use spawn. 
import ray + os.environ["RAY_ADDRESS"] = ray.get_runtime_context().gcs_address reasons.append("In a Ray actor and can only be spawned") @@ -2980,7 +3043,9 @@ def _maybe_force_spawn(): "Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " "See https://docs.vllm.ai/en/latest/usage/" "troubleshooting.html#python-multiprocessing " - "for more information. Reasons: %s", "; ".join(reasons)) + "for more information. Reasons: %s", + "; ".join(reasons), + ) os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -2999,7 +3064,7 @@ def get_mp_context(): def bind_kv_cache( ctx: dict[str, Any], kv_cache: list[list[torch.Tensor]], # [virtual_engine][layer_index] - shared_kv_cache_layers: Optional[dict[str, str]] = None + shared_kv_cache_layers: Optional[dict[str, str]] = None, ) -> None: # Bind the kv_cache tensor to Attention modules, similar to # ctx[layer_name].kv_cache[ve]=kv_cache[ve][extract_layer_index(layer_name)] @@ -3017,33 +3082,40 @@ def bind_kv_cache( shared_kv_cache_layers = {} from vllm.attention import AttentionType from vllm.model_executor.models.utils import extract_layer_index + layer_need_kv_cache = [ - layer_name for layer_name in ctx - if (hasattr(ctx[layer_name], 'attn_type') and ctx[layer_name].attn_type - in (AttentionType.DECODER, AttentionType.ENCODER_DECODER)) \ - and ctx[layer_name].kv_sharing_target_layer_name is None + layer_name + for layer_name in ctx + if ( + hasattr(ctx[layer_name], "attn_type") + and ctx[layer_name].attn_type + in (AttentionType.DECODER, AttentionType.ENCODER_DECODER) + ) + and ctx[layer_name].kv_sharing_target_layer_name is None ] layer_index_sorted = sorted( - set( - extract_layer_index(layer_name) - for layer_name in layer_need_kv_cache)) + set(extract_layer_index(layer_name) for layer_name in layer_need_kv_cache) + ) for layer_name in layer_need_kv_cache: - kv_cache_idx = layer_index_sorted.index( - extract_layer_index(layer_name)) + kv_cache_idx = layer_index_sorted.index(extract_layer_index(layer_name)) forward_ctx = 
ctx[layer_name] assert len(forward_ctx.kv_cache) == len(kv_cache) for ve, ve_kv_cache in enumerate(kv_cache): forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx] if shared_kv_cache_layers is not None: for layer_name, target_layer_name in shared_kv_cache_layers.items(): - assert extract_layer_index(target_layer_name) < \ - extract_layer_index(layer_name), \ - "v0 doesn't support interleaving kv sharing" + assert extract_layer_index(target_layer_name) < extract_layer_index( + layer_name + ), "v0 doesn't support interleaving kv sharing" ctx[layer_name].kv_cache = ctx[target_layer_name].kv_cache -def run_method(obj: Any, method: Union[str, bytes, Callable], args: tuple[Any], - kwargs: dict[str, Any]) -> Any: +def run_method( + obj: Any, + method: Union[str, bytes, Callable], + args: tuple[Any], + kwargs: dict[str, Any], +) -> Any: """ Run a method of an object with the given arguments and keyword arguments. If the method is string, it will be converted to a method using getattr. @@ -3057,8 +3129,9 @@ def run_method(obj: Any, method: Union[str, bytes, Callable], args: tuple[Any], try: func = getattr(obj, method) except AttributeError: - raise NotImplementedError(f"Method {method!r} is not" - " implemented.") from None + raise NotImplementedError( + f"Method {method!r} is not implemented." + ) from None else: func = partial(method, obj) # type: ignore return func(*args, **kwargs) @@ -3092,6 +3165,7 @@ def import_pynvml(): module to our codebase, and use it directly. 
""" import vllm.third_party.pynvml as pynvml + return pynvml @@ -3111,7 +3185,7 @@ def find_unimplemented_methods(self: object): unimplemented_methods = [] for attr_name in dir(self): # bypass inner method - if attr_name.startswith('_'): + if attr_name.startswith("_"): continue try: @@ -3125,8 +3199,8 @@ def find_unimplemented_methods(self: object): if "NotImplementedError" in src: unimplemented_methods.append(attr_name) if unimplemented_methods: - method_names = ','.join(unimplemented_methods) - msg = (f"Methods {method_names} not implemented in {self}") + method_names = ",".join(unimplemented_methods) + msg = f"Methods {method_names} not implemented in {self}" logger.debug(msg) @wraps(original_init) @@ -3134,7 +3208,7 @@ def wrapped_init(self, *args, **kwargs) -> None: original_init(self, *args, **kwargs) find_unimplemented_methods(self) - type.__setattr__(cls, '__init__', wrapped_init) + type.__setattr__(cls, "__init__", wrapped_init) return cls @@ -3238,7 +3312,6 @@ def cprofile(save_file: Optional[str] = None, enabled: bool = True): """ def decorator(func: Callable): - @wraps(func) def wrapper(*args, **kwargs): if not enabled: @@ -3256,16 +3329,26 @@ def wrapper(*args, **kwargs): # Only relevant for models using ALiBi (e.g, MPT) def check_use_alibi(model_config: ModelConfig) -> bool: cfg = model_config.hf_text_config - return (getattr(cfg, "alibi", False) # Falcon - or ("BloomForCausalLM" in getattr(model_config.hf_config, - "architectures", [])) # Bloom - or getattr(cfg, "position_encoding_type", "") == - "alibi" # codellm_1b_alibi - or (hasattr(cfg, "attn_config") # MPT - and ((isinstance(cfg.attn_config, dict) - and cfg.attn_config.get("alibi", False)) or - (not isinstance(cfg.attn_config, dict) - and getattr(cfg.attn_config, "alibi", False))))) + return ( + getattr(cfg, "alibi", False) # Falcon + or ( + "BloomForCausalLM" in getattr(model_config.hf_config, "architectures", []) + ) # Bloom + or getattr(cfg, "position_encoding_type", "") == "alibi" # 
codellm_1b_alibi + or ( + hasattr(cfg, "attn_config") # MPT + and ( + ( + isinstance(cfg.attn_config, dict) + and cfg.attn_config.get("alibi", False) + ) + or ( + not isinstance(cfg.attn_config, dict) + and getattr(cfg.attn_config, "alibi", False) + ) + ) + ) + ) def sha256(input: Any) -> bytes: @@ -3333,7 +3416,7 @@ def is_torch_equal_or_newer(target: str) -> bool: return _is_torch_equal_or_newer(str(torch.__version__), target) except Exception: # Fallback to PKG-INFO to load the package info, needed by the doc gen. - return Version(importlib.metadata.version('torch')) >= Version(target) + return Version(importlib.metadata.version("torch")) >= Version(target) # Helper function used in testing. @@ -3376,9 +3459,9 @@ def has_triton_kernels() -> bool: return _has_module("triton_kernels") -def set_process_title(name: str, - suffix: str = "", - prefix: str = envs.VLLM_PROCESS_NAME_PREFIX) -> None: +def set_process_title( + name: str, suffix: str = "", prefix: str = envs.VLLM_PROCESS_NAME_PREFIX +) -> None: """ Set the current process title to a specific name with an optional suffix. @@ -3405,7 +3488,7 @@ def write_with_prefix(s: str): if file.start_new_line: # type: ignore[attr-defined] file_write(prefix) idx = 0 - while (next_idx := s.find('\n', idx)) != -1: + while (next_idx := s.find("\n", idx)) != -1: next_idx += 1 file_write(s[idx:next_idx]) if next_idx == len(s): @@ -3449,23 +3532,20 @@ def length_from_prompt_token_ids_or_embeds( """Calculate the request length (in number of tokens) give either prompt_token_ids or prompt_embeds. 
""" - prompt_token_len = None if prompt_token_ids is None else len( - prompt_token_ids) - prompt_embeds_len = \ - None if prompt_embeds is None else len(prompt_embeds) + prompt_token_len = None if prompt_token_ids is None else len(prompt_token_ids) + prompt_embeds_len = None if prompt_embeds is None else len(prompt_embeds) if prompt_token_len is None: if prompt_embeds_len is None: - raise ValueError( - "Neither prompt_token_ids nor prompt_embeds were defined.") + raise ValueError("Neither prompt_token_ids nor prompt_embeds were defined.") return prompt_embeds_len else: - if (prompt_embeds_len is not None - and prompt_embeds_len != prompt_token_len): + if prompt_embeds_len is not None and prompt_embeds_len != prompt_token_len: raise ValueError( "Prompt token ids and prompt embeds had different lengths" f" prompt_token_ids={prompt_token_len}" - f" prompt_embeds={prompt_embeds_len}") + f" prompt_embeds={prompt_embeds_len}" + ) return prompt_token_len diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5200cbb322d5..f6bfcf90b792 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -22,32 +22,33 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import (GiB_bytes, MemorySnapshot, - length_from_prompt_token_ids_or_embeds, - memory_profiling) +from vllm.utils import ( + GiB_bytes, + MemorySnapshot, + length_from_prompt_token_ids_or_embeds, + memory_profiling, +) from vllm.v1.engine import EngineCoreRequest -from vllm.v1.structured_output.backend_guidance import ( - validate_guidance_grammar) +from vllm.v1.structured_output.backend_guidance import validate_guidance_grammar from vllm.v1.structured_output.backend_lm_format_enforcer import ( - validate_structured_output_request_lm_format_enforcer) + validate_structured_output_request_lm_format_enforcer, +) from vllm.v1.structured_output.backend_outlines 
import ( - validate_structured_output_request_outlines) -from vllm.v1.structured_output.backend_xgrammar import ( - validate_xgrammar_grammar) + validate_structured_output_request_outlines, +) +from vllm.v1.structured_output.backend_xgrammar import validate_xgrammar_grammar from vllm.v1.worker.utils import MultiModalBudget, check_enough_init_memory logger = init_logger(__name__) class Processor: - def __init__( self, vllm_config: VllmConfig, tokenizer: AnyTokenizer, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): - self.vllm_config = vllm_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config @@ -57,12 +58,10 @@ def __init__( self.structured_outputs_config = vllm_config.structured_outputs_config self.tokenizer = tokenizer - self.generation_config_fields = ( - self.model_config.try_get_generation_config()) + self.generation_config_fields = self.model_config.try_get_generation_config() self.mm_registry = mm_registry - self.mm_processor_cache = processor_cache_from_config( - vllm_config, mm_registry) + self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry) self.input_preprocessor = InputPreprocessor( self.model_config, @@ -89,7 +88,8 @@ def _validate_logprobs( if num_logprobs > max_logprobs: raise ValueError( f"Requested sample logprobs of {num_logprobs}, " - f"which is is greater than max allowed: {max_logprobs}") + f"which is is greater than max allowed: {max_logprobs}" + ) # Validate prompt logprobs. 
if params.prompt_logprobs: @@ -99,7 +99,8 @@ def _validate_logprobs( if num_prompt_logprobs > max_logprobs: raise ValueError( f"Requested prompt logprobs of {num_prompt_logprobs}, " - f"which is is greater than max allowed: {max_logprobs}") + f"which is is greater than max allowed: {max_logprobs}" + ) def _validate_sampling_params( self, @@ -118,8 +119,7 @@ def _validate_sampling_params( return vocab_size = len(self.tokenizer) if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids): - raise ValueError( - "allowed_token_ids contains out-of-vocab token id!") + raise ValueError("allowed_token_ids contains out-of-vocab token id!") def _validate_logit_bias( self, @@ -139,7 +139,8 @@ def _validate_logit_bias( if invalid_token_ids: raise ValueError( f"token_id(s) {invalid_token_ids} in logit_bias contain " - f"out-of-vocab token ids. Vocabulary size: {vocab_size}") + f"out-of-vocab token ids. Vocabulary size: {vocab_size}" + ) def _validate_supported_sampling_params( self, @@ -150,8 +151,9 @@ def _validate_supported_sampling_params( raise ValueError("vLLM V1 does not yet support best_of.") # Logits processors not supported. if params.logits_processors: - raise ValueError("vLLM V1 does not support per request " - "user provided logits processors.") + raise ValueError( + "vLLM V1 does not support per request user provided logits processors." + ) def _validate_params( self, @@ -188,18 +190,23 @@ def _validate_single_prompt(single_prompt: Union[dict, str]) -> None: for modality, items in mm_data.items(): if modality in mm_uuids: data_len = len(items) if isinstance(items, list) else 1 - uuid_len = len(mm_uuids[modality]) if isinstance( - mm_uuids[modality], list) else 1 + uuid_len = ( + len(mm_uuids[modality]) + if isinstance(mm_uuids[modality], list) + else 1 + ) if uuid_len != data_len: raise ValueError( f"multi_modal_uuids for modality '{modality}' " "must have same length as data: got " f"{uuid_len} uuids vs " - f"{data_len} items.") + f"{data_len} items." 
+ ) else: raise ValueError( f"multi_modal_uuids for modality '{modality}' must " - "be provided if multi_modal_data is provided.") + "be provided if multi_modal_data is provided." + ) # Handle explicit encoder/decoder prompts or singleton prompt if isinstance(prompt, dict) and "encoder_prompt" in prompt: @@ -218,8 +225,9 @@ def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: # LoRA request passed in while LoRA is not enabled if not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") + raise ValueError( + f"Got lora_request {lora_request} but LoRA is not enabled!" + ) if self.tokenizer is not None: logger.warning_once( @@ -227,7 +235,8 @@ def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: "tokenizers for different LoRAs. By default, vLLM uses base " "model's tokenizer. If you are using a LoRA " "with its own tokenizer, consider specifying `--tokenizer " - "[lora_path]` to use the LoRA tokenizer.") + "[lora_path]` to use the LoRA tokenizer." + ) def _validate_structured_output(self, params: SamplingParams) -> None: if not params.structured_outputs or not self.structured_outputs_config: @@ -245,20 +254,23 @@ def _validate_structured_output(self, params: SamplingParams) -> None: # to a specific backend based on `auto` behavior in a previous # request. We remember that it was set as a result of `auto` # using the `_backend_was_auto` field set in the params. - if (backend != _backend - and not (backend == "auto" - and params.structured_outputs._backend_was_auto)): + if backend != _backend and not ( + backend == "auto" and params.structured_outputs._backend_was_auto + ): raise ValueError( "Request-level structured output backend selection is not " f"supported. The request specified '{_backend}', but vLLM " f"was initialised with '{backend}'. This error can be " - "resolved by removing '_backend' from the request.") + "resolved by removing '_backend' from the request." 
+ ) else: params.structured_outputs._backend = backend # Request content validation - if (isinstance(params.structured_outputs.choice, list) - and not params.structured_outputs.choice): + if ( + isinstance(params.structured_outputs.choice, list) + and not params.structured_outputs.choice + ): # It is invalid for choice to be an empty list raise ValueError( f"Choice '{params.structured_outputs.choice}' cannot be an empty list" # noqa: E501 @@ -328,9 +340,7 @@ def _extract_mm_data(p: PromptType): mm_uuids: MultiModalUUIDDict = {} for modality, data in mm_data.items(): n = len(data) if isinstance(data, list) else 1 - mm_uuids[modality] = [ - f"{request_id}-{modality}-{i}" for i in range(n) - ] + mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)] return mm_uuids def process_inputs( @@ -345,16 +355,18 @@ def process_inputs( priority: int = 0, data_parallel_rank: Optional[int] = None, ) -> tuple[Optional[str], EngineCoreRequest]: - # TODO(woosuk): Support pooling models. self._validate_lora(lora_request) self._validate_params(params) data_parallel_size = self.vllm_config.parallel_config.data_parallel_size - if data_parallel_rank is not None and not (0 <= data_parallel_rank < - data_parallel_size): - raise ValueError(f"data_parallel_rank {data_parallel_rank} " - f"is out of range [0, {data_parallel_size}).") + if data_parallel_rank is not None and not ( + 0 <= data_parallel_rank < data_parallel_size + ): + raise ValueError( + f"data_parallel_rank {data_parallel_rank} " + f"is out of range [0, {data_parallel_size})." + ) if arrival_time is None: arrival_time = time.time() @@ -367,9 +379,11 @@ def process_inputs( # reused across requests, therefore identifying multimodal data items # by their content is no longer necessary, and we create uuids with # request id-modality-index as multimodal hash overrides. 
- if (self.model_config.multimodal_config and - self.model_config.multimodal_config.mm_processor_cache_gb == 0 - and not self.cache_config.enable_prefix_caching): + if ( + self.model_config.multimodal_config + and self.model_config.multimodal_config.mm_processor_cache_gb == 0 + and not self.cache_config.enable_prefix_caching + ): mm_uuids = self._maybe_build_mm_uuids(request_id, prompt) else: # Otherwise, use user-provided uuids as multimodal hash overrides @@ -390,6 +404,7 @@ def process_inputs( mm_uuids=mm_uuids, ) from vllm.platforms import current_platform + current_platform.validate_request( prompt=prompt, params=params, @@ -405,12 +420,19 @@ def process_inputs( # discriminated unions of TypedDicts, because of how it handles # inheritance of TypedDict. If we explicitly extract the items we want # we can avoid type errors from using `dict.get` later in the method. - prompt_str: Optional[str] = None if decoder_inputs[ - "type"] == "embeds" else decoder_inputs.get("prompt") - prompt_token_ids = decoder_inputs[ - "prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None - prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[ - "type"] == "embeds" else None + prompt_str: Optional[str] = ( + None if decoder_inputs["type"] == "embeds" else decoder_inputs.get("prompt") + ) + prompt_token_ids = ( + decoder_inputs["prompt_token_ids"] + if decoder_inputs["type"] != "embeds" + else None + ) + prompt_embeds = ( + decoder_inputs["prompt_embeds"] + if decoder_inputs["type"] == "embeds" + else None + ) sampling_params = None pooling_params = None @@ -420,11 +442,12 @@ def process_inputs( # If unset max tokens, then generate up to the max_model_len. 
if sampling_params.max_tokens is None: seq_len = length_from_prompt_token_ids_or_embeds( - prompt_token_ids, prompt_embeds) - sampling_params.max_tokens = \ - self.model_config.max_model_len - seq_len + prompt_token_ids, prompt_embeds + ) + sampling_params.max_tokens = self.model_config.max_model_len - seq_len sampling_params.update_from_generation_config( - self.generation_config_fields, eos_token_id) + self.generation_config_fields, eos_token_id + ) if self.tokenizer is not None: sampling_params.update_from_tokenizer(self.tokenizer) else: @@ -450,7 +473,9 @@ def process_inputs( data=decoder_mm_inputs[modality][idx], modality=modality, identifier=decoder_mm_hashes[modality][idx], - mm_position=decoder_mm_positions[modality][idx])) + mm_position=decoder_mm_positions[modality][idx], + ) + ) return prompt_str, EngineCoreRequest( request_id=request_id, @@ -484,12 +509,17 @@ def _validate_model_input( ): model_config = self.model_config - prompt_ids = None if prompt_inputs[ - "type"] == "embeds" else prompt_inputs["prompt_token_ids"] - prompt_embeds = prompt_inputs["prompt_embeds"] if prompt_inputs[ - "type"] == "embeds" else None - prompt_len = length_from_prompt_token_ids_or_embeds( - prompt_ids, prompt_embeds) + prompt_ids = ( + None + if prompt_inputs["type"] == "embeds" + else prompt_inputs["prompt_token_ids"] + ) + prompt_embeds = ( + prompt_inputs["prompt_embeds"] + if prompt_inputs["type"] == "embeds" + else None + ) + prompt_len = length_from_prompt_token_ids_or_embeds(prompt_ids, prompt_embeds) if not prompt_ids: if prompt_type == "encoder" and model_config.is_multimodal_model: pass # Mllama may have empty encoder inputs for text-only data @@ -514,10 +544,10 @@ def _validate_model_input( # Here we take the max of the two to determine if a token id is # truly out-of-vocabulary. 
- if max_input_id > max(tokenizer.max_token_id, - self.model_config.get_vocab_size() - 1): - raise ValueError( - f"Token id {max_input_id} is out of vocabulary") + if max_input_id > max( + tokenizer.max_token_id, self.model_config.get_vocab_size() - 1 + ): + raise ValueError(f"Token id {max_input_id} is out of vocabulary") max_prompt_len = self.model_config.max_model_len if prompt_len > max_prompt_len: @@ -537,16 +567,19 @@ def _validate_model_input( "Make sure that `max_model_len` is no smaller than the " "number of text tokens plus multimodal tokens. For image " "inputs, the number of image tokens depends on the number " - "of images, and possibly their aspect ratios as well.") + "of images, and possibly their aspect ratios as well." + ) else: suggestion = ( "Make sure that `max_model_len` is no smaller than the " - "number of text tokens.") + "number of text tokens." + ) raise ValueError( f"The {prompt_type} prompt (length {prompt_len}) is " f"longer than the maximum model length of {max_prompt_len}. 
" - f"{suggestion}") + f"{suggestion}" + ) # TODO: Find out how many placeholder tokens are there so we can # check that chunked prefill does not truncate them @@ -592,8 +625,9 @@ def profile_run(self) -> None: # Only check init memory if we are sure that the EngineCore is not # loading weights or running profiling on the same GPU new_device_index = torch.device(device).index or 0 - local_gpu_count = (parallel_config.data_parallel_size_local * - parallel_config.world_size) + local_gpu_count = ( + parallel_config.data_parallel_size_local * parallel_config.world_size + ) if new_device_index < local_gpu_count: logger.warning( "Both EngineCore and multi-modal processor are using " @@ -605,8 +639,10 @@ def profile_run(self) -> None: check_enough_init_memory(baseline_snapshot, self.cache_config) with memory_profiling(baseline_snapshot) as diff: - for modality, max_items_per_prompt in ( - mm_budget.max_items_per_prompt_by_modality.items()): + for ( + modality, + max_items_per_prompt, + ) in mm_budget.max_items_per_prompt_by_modality.items(): self.mm_registry.get_decoder_dummy_data( model_config=model_config, seq_len=scheduler_config.max_num_batched_tokens, @@ -622,10 +658,12 @@ def profile_run(self) -> None: device, ) if memory_usage > diff.before_profile.free_memory: - raise ValueError(f"Not enough memory in {device} " - f"for multi-modal processor. " - f"Try reducing `api_server_count` or " - f"revert to CPU processing.") + raise ValueError( + f"Not enough memory in {device} " + f"for multi-modal processor. " + f"Try reducing `api_server_count` or " + f"revert to CPU processing." 
+ ) def clear_cache(self) -> None: self.input_preprocessor.clear_cache() diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index c6efc8b8c734..020334682772 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A GPU worker class.""" + import copy import gc import os @@ -13,9 +14,11 @@ import vllm.envs as envs from vllm.config import VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment, - set_custom_all_reduce) +from vllm.distributed import ( + ensure_model_parallel_initialized, + init_distributed_environment, + set_custom_all_reduce, +) from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized from vllm.distributed.parallel_state import get_pp_group, get_tp_group from vllm.logger import init_logger @@ -28,8 +31,12 @@ from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec -from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, - DraftTokenIds, ModelRunnerOutput) +from vllm.v1.outputs import ( + EMPTY_MODEL_RUNNER_OUTPUT, + AsyncModelRunnerOutput, + DraftTokenIds, + ModelRunnerOutput, +) from vllm.v1.utils import report_usage_stats from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.utils import is_residual_scattered_for_sp @@ -45,7 +52,6 @@ class Worker(WorkerBase): - def __init__( self, vllm_config: VllmConfig, @@ -54,16 +60,18 @@ def __init__( distributed_init_method: str, is_driver_worker: bool = False, ): - - super().__init__(vllm_config=vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - is_driver_worker=is_driver_worker) + super().__init__( + vllm_config=vllm_config, + 
local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=is_driver_worker, + ) if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules + init_cached_hf_modules() # Buffers saved before sleep @@ -73,8 +81,10 @@ def __init__( # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace if envs.VLLM_TORCH_PROFILER_DIR: torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info("Profiling enabled. Traces will be saved to: %s", - torch_profiler_trace_dir) + logger.info( + "Profiling enabled. Traces will be saved to: %s", + torch_profiler_trace_dir, + ) logger.debug( "Profiler config: record_shapes=%s," "profile_memory=%s,with_stack=%s,with_flops=%s", @@ -93,7 +103,9 @@ def __init__( with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, use_gzip=True)) + torch_profiler_trace_dir, use_gzip=True + ), + ) else: self.profiler = None @@ -106,20 +118,20 @@ def sleep(self, level: int = 1) -> None: if level == 2: model = self.model_runner.model self._sleep_saved_buffers = { - name: buffer.cpu().clone() - for name, buffer in model.named_buffers() + name: buffer.cpu().clone() for name, buffer in model.named_buffers() } allocator = CuMemAllocator.get_instance() - allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) + allocator.sleep(offload_tags=("weights",) if level == 1 else tuple()) free_bytes_after_sleep, total = torch.cuda.mem_get_info() freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep used_bytes = total - free_bytes_after_sleep assert freed_bytes >= 0, "Memory usage increased after sleeping." 
logger.info( - "Sleep mode freed %.2f GiB memory, " - "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes, - used_bytes / GiB_bytes) + "Sleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.", + freed_bytes / GiB_bytes, + used_bytes / GiB_bytes, + ) def wake_up(self, tags: Optional[list[str]] = None) -> None: from vllm.device_allocator.cumem import CuMemAllocator @@ -135,23 +147,21 @@ def wake_up(self, tags: Optional[list[str]] = None) -> None: buffer.data.copy_(self._sleep_saved_buffers[name].data) self._sleep_saved_buffers = {} - def _maybe_get_memory_pool_context(self, - tag: str) -> AbstractContextManager: + def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager: if self.vllm_config.model_config.enable_sleep_mode: from vllm.device_allocator.cumem import CuMemAllocator allocator = CuMemAllocator.get_instance() if tag == "weights": assert allocator.get_current_usage() == 0, ( - "Sleep mode can only be " - "used for one instance per process.") + "Sleep mode can only be used for one instance per process." + ) context = allocator.use_memory_pool(tag=tag) else: context = nullcontext() return context - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks @@ -176,10 +186,13 @@ def init_device(self): # memory snapshot # This ensures NCCL buffers are allocated before we measure # available memory - init_worker_distributed_environment(self.vllm_config, self.rank, - self.distributed_init_method, - self.local_rank, - current_platform.dist_backend) + init_worker_distributed_environment( + self.vllm_config, + self.rank, + self.distributed_init_method, + self.local_rank, + current_platform.dist_backend, + ) # Set random seed. 
set_random_seed(self.model_config.seed) @@ -195,12 +208,12 @@ def init_device(self): self.cache_config, ) else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") + raise RuntimeError(f"Not support device type: {self.device_config.device}") # Construct the model runner self.model_runner: GPUModelRunner = GPUModelRunner( - self.vllm_config, self.device) + self.vllm_config, self.device + ) if self.rank == 0: # If usage stat is enabled, collect relevant info. @@ -248,7 +261,8 @@ def determine_available_memory(self) -> int: "size. If OOM'ed, check the difference of initial free " "memory between the current run and the previous run " "where kv_cache_memory_bytes is suggested and update it " - "correspondingly.") + "correspondingly." + ) logger.info(msg) return kv_cache_memory_bytes @@ -258,8 +272,8 @@ def determine_available_memory(self) -> int: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. with memory_profiling( - self.init_snapshot, - weights_memory=int(self.model_runner.model_memory_usage), + self.init_snapshot, + weights_memory=int(self.model_runner.model_memory_usage), ) as profile_result: self.model_runner.profile_run() @@ -276,15 +290,15 @@ def determine_available_memory(self) -> int: "This happens when other processes sharing the same container " "release GPU memory while vLLM is profiling during initialization. " "To fix this, ensure consistent GPU memory allocation or " - "isolate vLLM in its own container.") - self.available_kv_cache_memory_bytes = self.requested_memory \ - - profile_result.non_kv_cache_memory + "isolate vLLM in its own container." 
+ ) + self.available_kv_cache_memory_bytes = ( + self.requested_memory - profile_result.non_kv_cache_memory + ) - unrequested_memory = self.init_snapshot.free_memory \ - - self.requested_memory + unrequested_memory = self.init_snapshot.free_memory - self.requested_memory logger.debug( - "Initial free memory: %.2f GiB; " - "Requested memory: %.2f (util), %.2f GiB", + "Initial free memory: %.2f GiB; Requested memory: %.2f (util), %.2f GiB", GiB(self.init_snapshot.free_memory), self.cache_config.gpu_memory_utilization, GiB(self.requested_memory), @@ -296,8 +310,10 @@ def determine_available_memory(self) -> int: GiB(free_gpu_memory - unrequested_memory), ) logger.debug(profile_result) - logger.info("Available KV cache memory: %.2f GiB", - GiB(self.available_kv_cache_memory_bytes)) + logger.info( + "Available KV cache memory: %.2f GiB", + GiB(self.available_kv_cache_memory_bytes), + ) gc.collect() return int(self.available_kv_cache_memory_bytes) @@ -325,15 +341,14 @@ def compile_or_warm_up_model(self) -> None: warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy() if not self.model_config.enforce_eager: warmup_sizes = [ - x for x in warmup_sizes if x not in - self.vllm_config.compilation_config.cudagraph_capture_sizes + x + for x in warmup_sizes + if x not in self.vllm_config.compilation_config.cudagraph_capture_sizes ] # We skip EPLB here since we don't want to record dummy metrics for size in sorted(warmup_sizes, reverse=True): logger.info("Compile and warming up model for size %d", size) - self.model_runner._dummy_run(size, - skip_eplb=True, - remove_lora=False) + self.model_runner._dummy_run(size, skip_eplb=True, remove_lora=False) self.model_runner.maybe_remove_all_loras(self.model_runner.lora_config) # Warmup and tune the kernels used during model execution before @@ -344,8 +359,9 @@ def compile_or_warm_up_model(self) -> None: if not self.model_config.enforce_eager: cuda_graph_memory_bytes = self.model_runner.capture_model() - if 
(self.cache_config.kv_cache_memory_bytes is None - and hasattr(self, "peak_activation_memory")): + if self.cache_config.kv_cache_memory_bytes is None and hasattr( + self, "peak_activation_memory" + ): # Suggests optimal kv cache memory size if we rely on # memory_profiling to guess the kv cache memory size which # provides peak_activation_memory and a few other memory @@ -359,16 +375,22 @@ def compile_or_warm_up_model(self) -> None: # slightly underestimate the memory consumption. # So leave a small buffer (=150MiB) to avoid OOM. redundancy_buffer_memory = 150 * (1 << 20) - non_kv_cache_memory = (self.model_runner.model_memory_usage + - self.peak_activation_memory + - self.non_torch_memory + - cuda_graph_memory_bytes) + non_kv_cache_memory = ( + self.model_runner.model_memory_usage + + self.peak_activation_memory + + self.non_torch_memory + + cuda_graph_memory_bytes + ) kv_cache_memory_bytes_to_gpu_limit = ( - self.init_snapshot.free_memory - non_kv_cache_memory - - redundancy_buffer_memory) + self.init_snapshot.free_memory + - non_kv_cache_memory + - redundancy_buffer_memory + ) kv_cache_memory_bytes_to_requested_limit = ( - int(self.requested_memory) - non_kv_cache_memory - - redundancy_buffer_memory) + int(self.requested_memory) + - non_kv_cache_memory + - redundancy_buffer_memory + ) msg = ( f"Free memory on device " @@ -389,7 +411,8 @@ def compile_or_warm_up_model(self) -> None: f"{kv_cache_memory_bytes_to_gpu_limit}` " f"({GiB(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully " f"utilize gpu memory. Current kv cache memory in use is " - f"{GiB(self.available_kv_cache_memory_bytes)} GiB.") + f"{GiB(self.available_kv_cache_memory_bytes)} GiB." + ) logger.debug(msg) @@ -399,20 +422,20 @@ def compile_or_warm_up_model(self) -> None: # NOTE: This is called after `capture_model` on purpose to prevent # memory buffers from being cleared by `torch.cuda.empty_cache`. 
if get_pp_group().is_last_rank: - max_num_reqs = min(self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens) + max_num_reqs = min( + self.scheduler_config.max_num_seqs, + self.scheduler_config.max_num_batched_tokens, + ) # We skip EPLB here since we don't want to record dummy metrics - hidden_states, last_hidden_states = \ - self.model_runner._dummy_run( - num_tokens=max_num_reqs, - skip_eplb=True, - ) + hidden_states, last_hidden_states = self.model_runner._dummy_run( + num_tokens=max_num_reqs, + skip_eplb=True, + ) if self.model_runner.is_pooling_model: self.model_runner._dummy_pooler_run(hidden_states) else: - self.model_runner._dummy_sampler_run( - hidden_states=last_hidden_states) + self.model_runner._dummy_sampler_run(hidden_states=last_hidden_states) # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. @@ -432,32 +455,36 @@ def execute_model( intermediate_tensors = None forward_pass = scheduler_output.total_num_scheduled_tokens > 0 num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens - num_input_tokens = self.model_runner._get_num_input_tokens( - num_scheduled_tokens) + num_input_tokens = self.model_runner._get_num_input_tokens(num_scheduled_tokens) all_gather_tensors = { - "residual": - not is_residual_scattered_for_sp(self.vllm_config, - num_input_tokens) + "residual": not is_residual_scattered_for_sp( + self.vllm_config, num_input_tokens + ) } if forward_pass and not get_pp_group().is_first_rank: intermediate_tensors = IntermediateTensors( get_pp_group().recv_tensor_dict( all_gather_group=get_tp_group(), - all_gather_tensors=all_gather_tensors)) + all_gather_tensors=all_gather_tensors, + ) + ) - output = self.model_runner.execute_model(scheduler_output, - intermediate_tensors) + output = self.model_runner.execute_model(scheduler_output, intermediate_tensors) if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)): return output assert 
isinstance(output, IntermediateTensors) parallel_config = self.vllm_config.parallel_config - assert parallel_config.distributed_executor_backend != ( - "external_launcher") and not get_pp_group().is_last_rank + assert ( + parallel_config.distributed_executor_backend != ("external_launcher") + and not get_pp_group().is_last_rank + ) - get_pp_group().send_tensor_dict(output.tensors, - all_gather_group=get_tp_group(), - all_gather_tensors=all_gather_tensors) + get_pp_group().send_tensor_dict( + output.tensors, + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors, + ) kv_connector_output = output.kv_connector_output if not kv_connector_output: @@ -465,8 +492,10 @@ def execute_model( # In case of PP with kv transfer, we need to pass through the # kv_connector_output - if (not kv_connector_output.finished_sending - and not kv_connector_output.finished_recving): + if ( + not kv_connector_output.finished_sending + and not kv_connector_output.finished_recving + ): return EMPTY_MODEL_RUNNER_OUTPUT output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) @@ -485,8 +514,9 @@ def profile(self, is_start: bool = True): self.profiler.stop() # only print profiler results on rank 0 if self.local_rank == 0: - print(self.profiler.key_averages().table( - sort_by="self_cuda_time_total")) + print( + self.profiler.key_averages().table(sort_by="self_cuda_time_total") + ) def execute_dummy_batch(self) -> None: self.model_runner._dummy_run(1, uniform_decode=True) @@ -507,68 +537,79 @@ def check_health(self) -> None: # worker will always be healthy as long as it's running. 
return - def _eplb_before_scale_down(self, old_ep_size: int, - new_ep_size: int) -> None: + def _eplb_before_scale_down(self, old_ep_size: int, new_ep_size: int) -> None: from vllm.distributed.parallel_state import get_ep_group + if get_ep_group().rank == 0: - logger.info("[Elastic EP] Starting expert resharding " - "before scaling down...") + logger.info( + "[Elastic EP] Starting expert resharding before scaling down..." + ) rank_mapping = { old_ep_rank: old_ep_rank if old_ep_rank < new_ep_size else -1 for old_ep_rank in range(old_ep_size) } assert self.model_runner.eplb_state is not None - self.model_runner.eplb_state.rearrange(self.model_runner.model, - execute_shuffle=True, - global_expert_load=None, - rank_mapping=rank_mapping) + self.model_runner.eplb_state.rearrange( + self.model_runner.model, + execute_shuffle=True, + global_expert_load=None, + rank_mapping=rank_mapping, + ) torch.cuda.synchronize() if get_ep_group().rank == 0: logger.info("[Elastic EP] Expert resharding completed!") def _eplb_after_scale_up( - self, old_ep_size: int, new_ep_size: int, - global_expert_load: Optional[torch.Tensor]) -> None: + self, + old_ep_size: int, + new_ep_size: int, + global_expert_load: Optional[torch.Tensor], + ) -> None: from vllm.distributed.parallel_state import get_ep_group + if get_ep_group().rank == 0: - logger.info("[Elastic EP] Starting expert resharding " - "after scaling up...") - rank_mapping = { - old_ep_rank: old_ep_rank - for old_ep_rank in range(old_ep_size) - } + logger.info("[Elastic EP] Starting expert resharding after scaling up...") + rank_mapping = {old_ep_rank: old_ep_rank for old_ep_rank in range(old_ep_size)} assert self.model_runner.eplb_state is not None self.model_runner.eplb_state.rearrange( self.model_runner.model, execute_shuffle=True, global_expert_load=global_expert_load, - rank_mapping=rank_mapping) + rank_mapping=rank_mapping, + ) if get_ep_group().rank == 0: logger.info("[Elastic EP] Expert resharding completed!") def 
_reconfigure_parallel_config( - self, reconfig_request: ReconfigureDistributedRequest) -> None: + self, reconfig_request: ReconfigureDistributedRequest + ) -> None: """ Update parallel config with provided reconfig_request """ parallel_config = self.vllm_config.parallel_config - parallel_config.data_parallel_size = \ - reconfig_request.new_data_parallel_size - if reconfig_request.new_data_parallel_rank != \ - ReconfigureRankType.KEEP_CURRENT_RANK: - parallel_config.data_parallel_rank = \ - reconfig_request.new_data_parallel_rank - if reconfig_request.new_data_parallel_rank_local != \ - ReconfigureRankType.KEEP_CURRENT_RANK: - parallel_config.data_parallel_rank_local = \ + parallel_config.data_parallel_size = reconfig_request.new_data_parallel_size + if ( + reconfig_request.new_data_parallel_rank + != ReconfigureRankType.KEEP_CURRENT_RANK + ): + parallel_config.data_parallel_rank = reconfig_request.new_data_parallel_rank + if ( + reconfig_request.new_data_parallel_rank_local + != ReconfigureRankType.KEEP_CURRENT_RANK + ): + parallel_config.data_parallel_rank_local = ( reconfig_request.new_data_parallel_rank_local - parallel_config.data_parallel_master_ip = \ + ) + parallel_config.data_parallel_master_ip = ( reconfig_request.new_data_parallel_master_ip - parallel_config.data_parallel_master_port = \ + ) + parallel_config.data_parallel_master_port = ( reconfig_request.new_data_parallel_master_port + ) - def _reconfigure_moe(self, old_ep_size: int, - new_ep_size: int) -> Optional[torch.Tensor]: + def _reconfigure_moe( + self, old_ep_size: int, new_ep_size: int + ) -> Optional[torch.Tensor]: """ Reconfigure MoE modules with provided reconfig_request @@ -576,20 +617,26 @@ def _reconfigure_moe(self, old_ep_size: int, otherwise None """ from vllm.distributed.parallel_state import ( - get_dp_group, get_ep_group, prepare_communication_buffer_for_model) - from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoEParallelConfig) + get_dp_group, + get_ep_group, + 
prepare_communication_buffer_for_model, + ) + from vllm.model_executor.layers.fused_moe.layer import FusedMoEParallelConfig parallel_config = self.vllm_config.parallel_config moe_modules = [ - module for module in self.model_runner.model.modules() - if (module.__class__.__name__ == "FusedMoE" - or module.__class__.__name__ == "SharedFusedMoE") + module + for module in self.model_runner.model.modules() + if ( + module.__class__.__name__ == "FusedMoE" + or module.__class__.__name__ == "SharedFusedMoE" + ) ] num_local_experts = moe_modules[0].moe_config.num_local_experts - assert all(module.moe_config.num_local_experts == num_local_experts - for module in moe_modules), ( - "All MoE modules must have the same number of experts") + assert all( + module.moe_config.num_local_experts == num_local_experts + for module in moe_modules + ), "All MoE modules must have the same number of experts" for module in moe_modules: module.moe_config.num_experts = num_local_experts * new_ep_size module.global_num_experts = module.moe_config.num_experts @@ -602,49 +649,62 @@ def _reconfigure_moe(self, old_ep_size: int, if new_ep_size < old_ep_size: num_local_physical_experts = num_local_experts assert self.model_runner.eplb_state is not None - new_physical_experts = \ + new_physical_experts = ( self.model_runner.eplb_state.physical_to_logical_map.shape[1] + ) parallel_config.eplb_config.num_redundant_experts = ( - new_physical_experts - - self.model_runner.eplb_state.logical_replica_count.shape[1]) + new_physical_experts + - self.model_runner.eplb_state.logical_replica_count.shape[1] + ) global_expert_load = None else: - num_local_physical_experts = torch.tensor([num_local_experts], - dtype=torch.int32, - device="cpu") - torch.distributed.broadcast(num_local_physical_experts, - group=get_ep_group().cpu_group, - group_src=0) + num_local_physical_experts = torch.tensor( + [num_local_experts], dtype=torch.int32, device="cpu" + ) + torch.distributed.broadcast( + num_local_physical_experts, 
group=get_ep_group().cpu_group, group_src=0 + ) num_local_physical_experts = num_local_physical_experts.item() new_physical_experts = num_local_physical_experts * new_ep_size assert self.model_runner.eplb_state is not None global_expert_load = self.model_runner.eplb_state.rearrange( - self.model_runner.model, execute_shuffle=False) + self.model_runner.model, execute_shuffle=False + ) parallel_config.eplb_config.num_redundant_experts = ( - new_physical_experts - global_expert_load.shape[1]) + new_physical_experts - global_expert_load.shape[1] + ) prepare_communication_buffer_for_model(self.model_runner.model) self.model_runner.model.update_physical_experts_metadata( num_physical_experts=new_physical_experts, - num_local_physical_experts=num_local_physical_experts) + num_local_physical_experts=num_local_physical_experts, + ) return global_expert_load def reinitialize_distributed( - self, reconfig_request: ReconfigureDistributedRequest) -> None: + self, reconfig_request: ReconfigureDistributedRequest + ) -> None: from vllm.config import set_current_vllm_config from vllm.distributed.parallel_state import ( - cleanup_dist_env_and_memory, get_ep_group) + cleanup_dist_env_and_memory, + get_ep_group, + ) old_ep_size = get_ep_group().world_size old_ep_rank = get_ep_group().rank - new_ep_size = reconfig_request.new_data_parallel_size * get_tp_group( - ).world_size * get_pp_group().world_size + new_ep_size = ( + reconfig_request.new_data_parallel_size + * get_tp_group().world_size + * get_pp_group().world_size + ) if new_ep_size < old_ep_size: self._eplb_before_scale_down(old_ep_size, new_ep_size) cleanup_dist_env_and_memory() - if reconfig_request.new_data_parallel_rank == \ - ReconfigureRankType.SHUTDOWN_CURRENT_RANK: + if ( + reconfig_request.new_data_parallel_rank + == ReconfigureRankType.SHUTDOWN_CURRENT_RANK + ): assert old_ep_rank >= new_ep_size # shutdown return @@ -652,16 +712,18 @@ def reinitialize_distributed( self._reconfigure_parallel_config(reconfig_request) 
with set_current_vllm_config(self.vllm_config): - init_worker_distributed_environment(self.vllm_config, self.rank, - self.distributed_init_method, - self.local_rank) + init_worker_distributed_environment( + self.vllm_config, + self.rank, + self.distributed_init_method, + self.local_rank, + ) global_expert_load = self._reconfigure_moe(old_ep_size, new_ep_size) if new_ep_size > old_ep_size: assert global_expert_load is not None - self._eplb_after_scale_up(old_ep_size, new_ep_size, - global_expert_load) + self._eplb_after_scale_up(old_ep_size, new_ep_size, global_expert_load) def save_sharded_state( self, @@ -670,6 +732,7 @@ def save_sharded_state( max_size: Optional[int] = None, ) -> None: from vllm.model_executor.model_loader import ShardedStateLoader + ShardedStateLoader.save_model( self.model_runner.model, path, @@ -682,7 +745,8 @@ def save_tensorized_model( tensorizer_config: "TensorizerConfig", ) -> None: self.model_runner.save_tensorized_model( - tensorizer_config=tensorizer_config, ) + tensorizer_config=tensorizer_config, + ) def shutdown(self) -> None: if runner := getattr(self, "model_runner", None): @@ -700,12 +764,14 @@ def init_worker_distributed_environment( parallel_config = vllm_config.parallel_config set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) - init_distributed_environment(parallel_config.world_size, rank, - distributed_init_method, local_rank, backend) + init_distributed_environment( + parallel_config.world_size, rank, distributed_init_method, local_rank, backend + ) ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size, - parallel_config.decode_context_parallel_size) + parallel_config.decode_context_parallel_size, + ) ensure_kv_transfer_initialized(vllm_config) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 7f6b17e03671..7e198461cac2 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -36,18 +36,18 @@ def __init__( 
self.model_config = model_config self.scheduler_config = scheduler_config self.mm_registry = mm_registry - self.cache = cache = processor_only_cache_from_config( - model_config, mm_registry) + self.cache = cache = processor_only_cache_from_config(model_config, mm_registry) self.max_model_len = model_config.max_model_len self.max_num_reqs = scheduler_config.max_num_seqs - self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, - cache=cache) + self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, cache=cache) - max_tokens_by_modality = mm_registry \ - .get_max_tokens_per_item_by_nonzero_modality(model_config, - cache=cache) + max_tokens_by_modality = ( + mm_registry.get_max_tokens_per_item_by_nonzero_modality( + model_config, cache=cache + ) + ) encoder_compute_budget, encoder_cache_size = compute_mm_encoder_budget( scheduler_config, @@ -146,17 +146,14 @@ def create_with_metadata_builders( vllm_config: VllmConfig, device: torch.device, num_metadata_builders: int = 1, - ) -> 'AttentionGroup': + ) -> "AttentionGroup": metadata_builders = [ - backend.get_builder_cls()(kv_cache_spec, layer_names, vllm_config, - device) + backend.get_builder_cls()(kv_cache_spec, layer_names, vllm_config, device) for _ in range(num_metadata_builders) ] - return AttentionGroup(backend, metadata_builders, layer_names, - kv_cache_spec) + return AttentionGroup(backend, metadata_builders, layer_names, kv_cache_spec) - def get_metadata_builder(self, - ubatch_id: int = 0) -> AttentionMetadataBuilder: + def get_metadata_builder(self, ubatch_id: int = 0) -> AttentionMetadataBuilder: assert len(self.metadata_builders) > ubatch_id return self.metadata_builders[ubatch_id] @@ -173,19 +170,22 @@ def sanity_check_mm_encoder_outputs( "Expected multimodal embeddings to be a list/tuple of 2D tensors, " f"or a single 3D tensor, but got {type(mm_embeddings)} " "instead. 
This is most likely due to incorrect implementation " - "of the model's `get_multimodal_embeddings` method.") + "of the model's `get_multimodal_embeddings` method." + ) assert len(mm_embeddings) == expected_num_items, ( "Expected number of multimodal embeddings to match number of " f"input items: {expected_num_items}, but got {len(mm_embeddings)=} " "instead. This is most likely due to incorrect implementation " - "of the model's `get_multimodal_embeddings` method.") + "of the model's `get_multimodal_embeddings` method." + ) assert all(e.ndim == 2 for e in mm_embeddings), ( "Expected multimodal embeddings to be a sequence of 2D tensors, " f"but got tensors with shapes {[e.shape for e in mm_embeddings]} " "instead. This is most likely due to incorrect implementation " - "of the model's `get_multimodal_embeddings` method.") + "of the model's `get_multimodal_embeddings` method." + ) def scatter_mm_placeholders( @@ -241,7 +241,8 @@ def check_enough_init_memory( that the current amount of free memory is sufficient for that. """ requested_memory = init_snapshot.total_memory * ( - cache_config.gpu_memory_utilization) + cache_config.gpu_memory_utilization + ) if init_snapshot.free_memory < requested_memory: GiB = lambda b: round(b / GiB_bytes, 2) @@ -252,7 +253,8 @@ def check_enough_init_memory( f"is less than desired GPU memory utilization " f"({cache_config.gpu_memory_utilization}, " f"{GiB(requested_memory)} GiB). Decrease GPU memory " - f"utilization or reduce GPU memory used by other processes.") + f"utilization or reduce GPU memory used by other processes." + ) return requested_memory @@ -343,16 +345,16 @@ def bind_kv_cache( forward_context[layer_name].kv_cache = [kv_cache] -def is_residual_scattered_for_sp(vllm_config: VllmConfig, - num_input_tokens: int) -> bool: +def is_residual_scattered_for_sp( + vllm_config: VllmConfig, num_input_tokens: int +) -> bool: """Check if the residual tensor is scattered for sequence parallelism. 
The residual tensor is scattered across tensor parallel ranks when sequence parallelism and tensor parallelism is enabled, and the number of input tokens is one of the compilation sizes. """ - if not vllm_config.compilation_config.pass_config.\ - enable_sequence_parallelism: + if not vllm_config.compilation_config.pass_config.enable_sequence_parallelism: return False tp = vllm_config.parallel_config.tensor_parallel_size @@ -365,4 +367,4 @@ def is_residual_scattered_for_sp(vllm_config: VllmConfig, assert num_input_tokens % tp == 0 # Currently, SP is only enabled for static size fx graphs. - return (num_input_tokens in vllm_config.compilation_config.compile_sizes) + return num_input_tokens in vllm_config.compilation_config.compile_sizes From 7b7721ec39c15f7456561163c3d1bdb6c7ef26f6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 6 Oct 2025 03:22:44 +0000 Subject: [PATCH 116/130] Update Signed-off-by: DarkLight1337 --- vllm/v1/worker/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index a23900f8bdac..7350d53a7cdd 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -240,9 +240,7 @@ def check_enough_init_memory( Calculate the amount of memory required by vLLM, then validate that the current amount of free memory is sufficient for that. 
""" - requested_memory = init_snapshot.total_memory * ( - cache_config.gpu_memory_utilization - ) + requested_memory = init_snapshot.total_memory * cache_config.gpu_memory_utilization if init_snapshot.free_memory < requested_memory: GiB = lambda b: round(b / GiB_bytes, 2) From ceedc5116d2b5cb662c18bed13d20d826e1b8ec5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 13 Oct 2025 05:37:13 +0000 Subject: [PATCH 117/130] ruff Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_audio.py | 3 +-- tests/entrypoints/openai/test_vision.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 05b135d8cf64..a2d8993441fc 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json -from typing import Union import openai import pytest @@ -55,7 +54,7 @@ def base64_encoded_audio() -> dict[str, str]: def dummy_messages_from_audio_url( - audio_urls: Union[str, list[str]], + audio_urls: str | list[str], content_text: str = "What's happening in this audio?", ): if isinstance(audio_urls, str): diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 2a6c0920471b..09bd0dabb799 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json -from typing import Union import openai import pytest @@ -80,7 +79,7 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]: def dummy_messages_from_image_url( - image_urls: Union[str, list[str]], + image_urls: str | list[str], content_text: str = "What's in this image?", ): if isinstance(image_urls, str): From a93169430ac42754cdac414a9ec72f5e7ce0c52c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 15 Oct 2025 
03:51:57 +0000 Subject: [PATCH 118/130] ruff format Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index a33fe927ca42..d75ec50a989d 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -231,12 +231,16 @@ To run Hugging Face processors on the GPU, you can pass the `device` argument ```python # Fast image processor requires use_fast=True -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_kwargs={"use_fast": True, "device": "cuda"}) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={"use_fast": True, "device": "cuda"}, +) # Whisper feature extractor does not require use_fast -llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", - mm_processor_kwargs={"device": "cuda"}) +llm = LLM( + model="Qwen/Qwen2-Audio-7B-Instruct", + mm_processor_kwargs={"device": "cuda"}, +) ``` !!! 
note From 8acf7eac5d76045fdcb758661a51b9527665307c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 15 Oct 2025 07:19:32 +0000 Subject: [PATCH 119/130] No yapf Signed-off-by: DarkLight1337 --- tests/multimodal/test_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index d86d8185efa4..36d9e1b92f24 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -237,7 +237,6 @@ async def test_fetch_video_http_with_dynamic_loader( assert metadata_sync["video_backend"] == "opencv_dynamic" -# yapf: disable @pytest.mark.parametrize( "case", [ @@ -332,7 +331,6 @@ async def test_fetch_video_http_with_dynamic_loader( ), ], ) -# yapf: enable def test_allocate_gpu_mm_processors(case): mm_processor_device = case["mm_processor_device"] mm_processor_count = case["mm_processor_count"] @@ -350,7 +348,6 @@ def test_allocate_gpu_mm_processors(case): assert gpu_allocation == expected_gpu_allocation -# yapf: disable @pytest.mark.parametrize( "case", [ From 84770b18f495955834b63fce81afe2e044be3857 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 15 Dec 2025 06:04:46 +0000 Subject: [PATCH 120/130] Fix Signed-off-by: DarkLight1337 --- docs/mkdocs/hooks/generate_argparse.py | 8 ++++---- vllm/v1/engine/input_processor.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index 4ae64a6e4bfc..40f37a2048d6 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -27,6 +27,10 @@ def mock_if_no_torch(mock_module: str, mock: MagicMock): sys.modules[mock_module] = mock +# Make torch.nn.Parameter safe to inherit from +mock_if_no_torch("torch.nn", MagicMock(Parameter=object)) + + # Mock custom op code class MockCustomOp: @staticmethod @@ -50,10 +54,6 @@ def decorator(cls): importlib.metadata.version = lambda name: VERSIONS.get(name) or "0.0.0" -# Make 
torch.nn.Parameter safe to inherit from -mock_if_no_torch("torch.nn", MagicMock(Parameter=object)) - - class PydanticMagicMock(MagicMock): """`MagicMock` that's able to generate pydantic-core schemas.""" diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 3ea270c35688..9bd46c158eff 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -5,8 +5,6 @@ from collections.abc import Mapping from typing import Any, Literal, cast -import torch - from vllm.config import VllmConfig from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs from vllm.inputs.parse import split_enc_dec_inputs @@ -682,6 +680,8 @@ def profile_run(self) -> None: baseline_snapshot = MemorySnapshot(device=device) + import torch + # Only check init memory if we are sure that the EngineCore is not # loading weights or running profiling on the same GPU new_device_index = torch.device(device).index or 0 From 1fcbf47f142432ceedc603bffb7f0e63d007e02e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 15 Dec 2025 06:10:05 +0000 Subject: [PATCH 121/130] Try fix docs Signed-off-by: DarkLight1337 --- docs/mkdocs/hooks/generate_argparse.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index 40f37a2048d6..a1eaed726f38 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -27,8 +27,12 @@ def mock_if_no_torch(mock_module: str, mock: MagicMock): sys.modules[mock_module] = mock -# Make torch.nn.Parameter safe to inherit from +# Make these classes safe to inherit from mock_if_no_torch("torch.nn", MagicMock(Parameter=object)) +mock_if_no_torch( + "vllm.model_executor.layers.attention_layer_base", + MagicMock(AttentionLayerBase=object), +) # Mock custom op code From 77bd9fe75174429cc222ff526405622bc152d127 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 15 Dec 2025 06:19:04 
+0000 Subject: [PATCH 122/130] Try Signed-off-by: DarkLight1337 --- docs/mkdocs/hooks/generate_argparse.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index a1eaed726f38..2da776ff73a4 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -29,6 +29,11 @@ def mock_if_no_torch(mock_module: str, mock: MagicMock): # Make these classes safe to inherit from mock_if_no_torch("torch.nn", MagicMock(Parameter=object)) +mock_if_no_torch( + "vllm.attention.layer", + MagicMock(Attention=object), +) + mock_if_no_torch( "vllm.model_executor.layers.attention_layer_base", MagicMock(AttentionLayerBase=object), From d3e213fbf0219dd6bc8bf4624c97a40f8f906f28 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 15 Dec 2025 12:58:20 +0000 Subject: [PATCH 123/130] Bad import Signed-off-by: DarkLight1337 --- vllm/v1/worker/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 0820946b262a..4d9a77eb4071 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -13,7 +13,8 @@ from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.registry import MultiModalRegistry from vllm.platforms import current_platform -from vllm.utils import GiB_bytes, MemorySnapshot +from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.mem_utils import MemorySnapshot from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec From 93b721d9766f5753febfe7294f2588f58cfb3e11 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 17 Dec 2025 15:35:46 +0000 Subject: [PATCH 124/130] Test Signed-off-by: DarkLight1337 --- docs/mkdocs/hooks/generate_argparse.py | 5 ----- 1 file changed, 5 deletions(-) diff 
--git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index 2da776ff73a4..a1eaed726f38 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -29,11 +29,6 @@ def mock_if_no_torch(mock_module: str, mock: MagicMock): # Make these classes safe to inherit from mock_if_no_torch("torch.nn", MagicMock(Parameter=object)) -mock_if_no_torch( - "vllm.attention.layer", - MagicMock(Attention=object), -) - mock_if_no_torch( "vllm.model_executor.layers.attention_layer_base", MagicMock(AttentionLayerBase=object), From ff96ad4cda38d458504f57a610528bcf7c5d8aab Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 17 Dec 2025 15:49:57 +0000 Subject: [PATCH 125/130] Fix Signed-off-by: DarkLight1337 --- vllm/v1/engine/input_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 9bd46c158eff..16eccc630db0 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -35,7 +35,7 @@ validate_structured_output_request_outlines, ) from vllm.v1.structured_output.backend_xgrammar import validate_xgrammar_grammar -from vllm.v1.worker.utils import MultiModalBudget, check_enough_init_memory +from vllm.v1.worker.utils import MultiModalBudget, request_memory logger = init_logger(__name__) @@ -696,7 +696,7 @@ def profile_run(self) -> None: device, ) else: - check_enough_init_memory(baseline_snapshot, self.cache_config) + request_memory(baseline_snapshot, self.cache_config) with memory_profiling(baseline_snapshot) as diff: for ( From db55e110f46f0002345192be1b73c7cfd6cee095 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 17 Dec 2025 15:51:18 +0000 Subject: [PATCH 126/130] Simplify Signed-off-by: DarkLight1337 --- vllm/v1/engine/input_processor.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/input_processor.py 
b/vllm/v1/engine/input_processor.py index 16eccc630db0..ec4f7960b673 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -679,12 +679,11 @@ def profile_run(self) -> None: ) baseline_snapshot = MemorySnapshot(device=device) - - import torch + device_ = baseline_snapshot.device_ # Only check init memory if we are sure that the EngineCore is not # loading weights or running profiling on the same GPU - new_device_index = torch.device(device).index or 0 + new_device_index = device_.index local_gpu_count = ( parallel_config.data_parallel_size_local * parallel_config.world_size ) @@ -693,7 +692,7 @@ def profile_run(self) -> None: "Both EngineCore and multi-modal processor are using " "the same GPU (%s). This may result in inaccurate memory " "profiling, and resource contention during inference.", - device, + device_, ) else: request_memory(baseline_snapshot, self.cache_config) @@ -715,11 +714,11 @@ def profile_run(self) -> None: "Multi-modal processing took %.4f GiB and %.6f seconds on %s", memory_usage / GiB_bytes, diff.profile_time, - device, + device_, ) if memory_usage > diff.before_profile.free_memory: raise ValueError( - f"Not enough memory in {device} for multi-modal processor. " + f"Not enough memory in {device_} for multi-modal processor. " f"Try reducing `api_server_count` or revert to CPU processing." 
) From 46f9a72164f692d52fef34555a08351f07db1833 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 18 Dec 2025 07:02:06 +0000 Subject: [PATCH 127/130] Doc fix Signed-off-by: DarkLight1337 --- docs/mkdocs/hooks/generate_argparse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index a1eaed726f38..cdf11b07f825 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -29,6 +29,7 @@ def mock_if_no_torch(mock_module: str, mock: MagicMock): # Make these classes safe to inherit from mock_if_no_torch("torch.nn", MagicMock(Parameter=object)) +mock_if_no_torch("vllm.attention.layer", MagicMock(Attention=object)) mock_if_no_torch( "vllm.model_executor.layers.attention_layer_base", MagicMock(AttentionLayerBase=object), From e84e760ca92970ad8697e7c94b7884bd134b933e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 11 Feb 2026 04:09:33 +0000 Subject: [PATCH 128/130] Reduce diff Signed-off-by: DarkLight1337 --- docs/mkdocs/hooks/generate_argparse.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index 8e8e2792b080..801cc8a05d15 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -28,15 +28,6 @@ def mock_if_no_torch(mock_module: str, mock: MagicMock): sys.modules[mock_module] = mock -# Make these classes safe to inherit from -mock_if_no_torch("torch.nn", MagicMock(Parameter=object)) -mock_if_no_torch("vllm.attention.layer", MagicMock(Attention=object)) -mock_if_no_torch( - "vllm.model_executor.layers.attention_layer_base", - MagicMock(AttentionLayerBase=object), -) - - # Mock custom op code class MockCustomOp: @staticmethod @@ -60,6 +51,10 @@ def decorator(cls): importlib.metadata.version = lambda name: VERSIONS.get(name) or "0.0.0" +# Make torch.nn.Parameter safe to inherit from 
+mock_if_no_torch("torch.nn", MagicMock(Parameter=object)) + + class PydanticMagicMock(MagicMock): """`MagicMock` that's able to generate pydantic-core schemas.""" From 0d161e443fc327fa41e8c9df2db19ef5323b2aa9 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 11 Feb 2026 04:17:44 +0000 Subject: [PATCH 129/130] Fix mypy Signed-off-by: DarkLight1337 --- vllm/v1/engine/input_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 1d6c2301cb71..d51e2b576322 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -79,7 +79,7 @@ def __init__( self.supports_mm_inputs = mm_registry.supports_multimodal_inputs(model_config) self.mm_encoder_cache_size = 0 - self.mm_max_items_per_prompt = dict[str, int]() + self.mm_max_items_per_prompt: Mapping[str, int] = {} self.skip_prompt_length_check = False if self.supports_mm_inputs: mm_budget = MultiModalBudget(vllm_config, mm_registry) From e304e79157c13c4bdd0afd0c6e6fd6f69ac902ce Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 23 Mar 2026 03:27:29 +0000 Subject: [PATCH 130/130] Fix Signed-off-by: DarkLight1337 --- tests/entrypoints/llm/test_chat.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 66fcd83e91a1..86395850e74f 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -4,15 +4,19 @@ import pytest -from tests.entrypoints.openai.chat_completion.test_vision import TEST_IMAGE_ASSETS +from tests.entrypoints.openai.chat_completion.test_audio import ( + TEST_AUDIO_URLS, + dummy_messages_from_audio_url, +) +from tests.entrypoints.openai.chat_completion.test_vision import ( + TEST_IMAGE_ASSETS, + dummy_messages_from_image_url, +) from vllm import LLM from vllm.distributed import cleanup_dist_env_and_memory from vllm.platforms import 
current_platform from vllm.sampling_params import SamplingParams -from ..openai.test_audio import TEST_AUDIO_URLS, dummy_messages_from_audio_url -from ..openai.test_vision import TEST_IMAGE_ASSETS, dummy_messages_from_image_url - @pytest.fixture(scope="function") def text_llm():