From c9388c7446d8f05f0bfe6ab3ad2d7f4815223244 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 29 Sep 2025 14:09:18 +0800 Subject: [PATCH 001/193] [V0 Deprecation][Models] Remove all V0 condition for mm embeddings merge (#25331) Signed-off-by: Isotr0py Signed-off-by: isotr0py <2037008807@qq.com> Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- vllm/model_executor/models/aya_vision.py | 11 --- vllm/model_executor/models/blip2.py | 11 --- vllm/model_executor/models/chameleon.py | 12 --- vllm/model_executor/models/cohere2_vision.py | 11 --- vllm/model_executor/models/deepseek_vl2.py | 11 --- vllm/model_executor/models/fuyu.py | 11 --- vllm/model_executor/models/gemma3_mm.py | 19 ---- vllm/model_executor/models/glm4_1v.py | 50 +--------- vllm/model_executor/models/glm4v.py | 17 +--- vllm/model_executor/models/granite_speech.py | 11 --- .../models/hyperclovax_vision.py | 15 +-- vllm/model_executor/models/idefics3.py | 11 --- vllm/model_executor/models/interns1.py | 18 +--- vllm/model_executor/models/internvl.py | 18 +--- vllm/model_executor/models/kimi_vl.py | 16 ---- vllm/model_executor/models/llava.py | 11 --- vllm/model_executor/models/llava_next.py | 11 --- .../model_executor/models/llava_next_video.py | 11 --- vllm/model_executor/models/llava_onevision.py | 46 +-------- vllm/model_executor/models/minicpmv.py | 15 +-- vllm/model_executor/models/mistral3.py | 11 --- vllm/model_executor/models/mllama4.py | 11 --- vllm/model_executor/models/molmo.py | 11 --- .../model_executor/models/nano_nemotron_vl.py | 18 +--- vllm/model_executor/models/nemotron_vl.py | 11 --- vllm/model_executor/models/ovis.py | 11 --- vllm/model_executor/models/ovis2_5.py | 12 --- vllm/model_executor/models/paligemma.py | 11 --- vllm/model_executor/models/phi3v.py | 11 --- vllm/model_executor/models/phi4_multimodal.py | 16 ---- vllm/model_executor/models/phi4mm.py | 48 +--------- vllm/model_executor/models/pixtral.py | 11 --- .../models/qwen2_5_omni_thinker.py | 31 
+------ vllm/model_executor/models/qwen2_5_vl.py | 58 +----------- vllm/model_executor/models/qwen2_audio.py | 11 --- vllm/model_executor/models/qwen2_vl.py | 50 +--------- vllm/model_executor/models/qwen3_vl.py | 93 +------------------ vllm/model_executor/models/qwen_vl.py | 12 --- vllm/model_executor/models/skyworkr1v.py | 11 --- vllm/model_executor/models/transformers.py | 13 --- vllm/model_executor/models/ultravox.py | 12 --- vllm/model_executor/models/voxtral.py | 13 --- 42 files changed, 13 insertions(+), 809 deletions(-) diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index eab996e9ba22..f6dfa435ddd4 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -427,17 +427,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - input_ids = None - hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 4d1850d07b28..334743a7358c 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -672,17 +672,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == _IMAGE_TOKEN_ID, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index f9740adb151b..86dbf63fa5df 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1014,18 +1014,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - image_token_id = self.model.vocabulary_mapping.image_token_id - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == image_token_id, - ) - input_ids = None - hidden_states = self.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 99edcba4d874..7162571c08d9 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -440,17 +440,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_id, - ) - input_ids = None - hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index b98008c83bdc..0f87fb34bf32 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -614,17 +614,6 @@ def forward(self, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.image_token_id, - ) - input_ids = None - hidden_states = self.language_model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index b99fe33a1dcc..9e491c0b50d2 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -352,17 +352,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == _IMAGE_TOKEN_ID, - ) - input_ids = None - hidden_states = self.language_model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index be75e36fe23b..36f8651371ba 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -596,25 +596,6 @@ def forward(self, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - if (vision_embeddings is not None) and len(vision_embeddings) != 0: - kwargs = self.prepare_attn_masks( - input_ids, - positions, - mask_dtype=self.dtype, - **kwargs, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index dbb5431ae491..722f1e428be7 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -71,7 +71,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import uses_mrope from vllm.utils.tensor_schema import TensorSchema, TensorShape from ..layers.activation import SiluAndMul @@ -80,8 +79,7 @@ from .qwen2_vl import (_create_qwen2vl_field_factory, apply_rotary_pos_emb_vision) from .utils import (AutoWeightsLoader, WeightsMapper, - 
init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) @@ -1552,32 +1550,6 @@ def get_multimodal_embeddings( multimodal_embeddings += video_embeddings return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Glm4vImageInputs] = None, - video_input: Optional[Glm4vVideoInputs] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_id, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1604,26 +1576,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. 
- elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input) - input_ids = None - hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index ace9c05daf15..22ddb1d75160 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -43,7 +43,7 @@ from .chatglm import ChatGLMBaseModel, ChatGLMModel from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) -from .utils import flatten_bn, isin_list +from .utils import flatten_bn class GLMVImagePixelInputs(TensorSchema): @@ -618,21 +618,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=isin_list(input_ids, [ - self.config.boi_token_id, - self.config.pad_token_id, - self.config.eoi_token_id, - ]), - ) - input_ids = None - hidden_states = self.transformer(input_ids, positions, intermediate_tensors, inputs_embeds) diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 8a02da58ea0b..0ec451356f5e 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -765,17 +765,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - audio_embeds = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - audio_embeds, - is_multimodal=input_ids == self.config.audio_token_index, - ) - input_ids = None - model_output = self.language_model(input_ids, positions, intermediate_tensors, inputs_embeds) return model_output diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index f851688bf7ba..b0f9d5e2657e 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -45,8 +45,7 @@ from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .siglip import SiglipVisionModel -from .utils import (AutoWeightsLoader, init_vllm_registered_model, isin_list, - maybe_prefix) +from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix from .vision import get_vision_encoder_info EOT = "<|endofturn|>" @@ -747,18 +746,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # 
NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - multimodal_embeddings, - is_multimodal=isin_list( - input_ids, - [self.config.image_token_id, self.config.video_token_id]), - ) - input_ids = None hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 3334ee224253..dddf1c6fb626 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -702,17 +702,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_id, - ) - input_ids = None - hidden_states = self.model.text_model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index 545dad1a96f5..0292845f819c 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -40,7 +40,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, isin_list, maybe_prefix) + init_vllm_registered_model, maybe_prefix) class InternS1MultiModalProjector(nn.Module): @@ -798,22 +798,6 @@ def forward( input_ids = None inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - context_token_ids = [ - token_id for token_id in (self.img_context_token_id, - self.video_context_token_id) - if token_id is not None - ] - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=isin_list(input_ids, context_token_ids), - ) - input_ids = None - forward_kwargs = { "input_ids": input_ids, "positions": positions, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 78aac8541434..0c95c49f90b1 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -43,7 +43,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - isin_list, maybe_prefix) + maybe_prefix) IMG_START = '' IMG_END = '' @@ -1371,22 +1371,6 @@ def forward( input_ids = None inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - context_token_ids = [ - token_id for token_id in (self.img_context_token_id, - self.video_context_token_id) - if token_id is not None - ] - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=isin_list(input_ids, context_token_ids), - ) - input_ids = None - forward_kwargs = { "input_ids": input_ids, "positions": positions, diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index db032736f914..30ec9029f74f 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -433,22 +433,6 @@ def forward( ) -> IntermediateTensors: if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: - inputs_embeds = None - else: - image_embeds = self._process_image_input(image_input) - inputs_embeds = self.get_input_embeddings( - input_ids, - image_embeds, - is_multimodal=input_ids == - self.config.media_placeholder_token_id, - ) - input_ids = None hidden_states = self.language_model( input_ids=input_ids, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 6f3cfd88aee2..46cf93be191e 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -723,17 +723,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index e132389c4f06..c4f1daaab9bf 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -547,17 +547,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 2642d8c77cf3..aebc661d53f8 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -431,17 +431,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.video_token_index, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 906858f4e2f4..6088195c91d5 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -30,8 +30,7 @@ LlavaNextProcessingInfo) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) # For profile run _MAX_FRAMES_PER_VIDEO = 16 @@ -850,33 +849,6 @@ def get_multimodal_embeddings(self, return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[LlavaOnevisionImagePixelInputs] = None, - video_input: Optional[LlavaOnevisionVideoPixelInputs] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_index, - ) - - if video_input is not None: - video_embeds = self._process_video_pixels(video_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_index, - ) - - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -894,22 +866,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # 
`get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index bffc9a0c125e..eaa3839af37b 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -71,7 +71,7 @@ from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) -from .utils import AutoWeightsLoader, flatten_bn, isin_list, maybe_prefix +from .utils import AutoWeightsLoader, flatten_bn, maybe_prefix # For profile run _MAX_FRAMES_PER_VIDEO = 16 @@ -1154,19 +1154,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=isin_list(input_ids, list(self.mm_token_ids)), - ) - input_ids = None - hidden_states = self.llm.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 31571ce962d1..e932f7f007f5 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -571,17 +571,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 3af5267928cd..db5a9fbc6a33 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -823,17 +823,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, - # this condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - input_ids = None - return self.language_model(input_ids, positions, intermediate_tensors, inputs_embeds) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 054caee9e8a4..0227a83a1f55 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1490,17 +1490,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.img_patch_id, - ) - input_ids = None - hidden_states = self.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 505806a15c89..2d0ebdc90277 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -35,7 +35,7 @@ from vllm.model_executor.models.radio import RadioModel from vllm.model_executor.models.utils import (flatten_bn, init_vllm_registered_model, - isin_list, maybe_prefix) + maybe_prefix) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, MultiModalKwargsItems, @@ -1135,22 +1135,6 @@ def forward( input_ids = None inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - context_token_ids = [ - token_id for token_id in (self.img_context_token_id, - self.video_context_token_id) - if token_id is not None - ] - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=isin_list(input_ids, context_token_ids), - ) - input_ids = None - hidden_states = self.language_model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index 2627a262e958..0e7ec8e458cf 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -608,17 +608,6 @@ def forward( input_ids = None inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.img_context_token_id, - ) - input_ids = None - forward_kwargs = { "input_ids": input_ids, "positions": positions, diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 8503d3f71d1c..2f9c6ddfc661 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -511,17 +511,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.image_pad_token_id, - ) - input_ids = None - # up until here we have an inputs_embeds 100% numerical identity # between the OG HF Transformers implementation and ours hidden_states = self.llm( diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index 2ecc7bff07e0..86ce7e9eab27 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -596,18 +596,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.image_pad_token_id, - ) - input_ids = None - # up until here we have a inputs_embeds 100% numerical identity # between the OG HF Transformers implementation and ours hidden_states = self.llm( diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index f07f444819f4..d118e6c89ab5 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -370,17 +370,6 @@ def forward(self, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.config.image_token_index, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index ea34c8d92f13..59977796e2af 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -679,17 +679,6 @@ def forward(self, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=self.image_token_id, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index e8b79717d75d..3dbb67d28065 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -1411,22 +1411,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. 
- elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - audio_input = self._parse_and_validate_audio_input(**kwargs) - - if image_input is None and audio_input is None: - inputs_embeds = None - else: - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - audio_input=audio_input) - input_ids = None - hidden_states = self.language_model( input_ids, positions, diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 15b09c7ae2bc..8ccc7129ddb2 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -35,8 +35,7 @@ from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal from .phi4mm_audio import AudioEmbedding -from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix, - merge_multimodal_embeddings) +from .utils import AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix # <|endoftext10|> (see vocab.json in hf model) _IMAGE_PLACEHOLDER_TOKEN_ID = 200010 @@ -1174,35 +1173,6 @@ def get_multimodal_embeddings(self, return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Phi4MMImagePixelInputs] = None, - audio_input: Optional[Phi4MMAudioFeatureInputs] = None, - ) -> torch.Tensor: - audio_projection_mode = 'speech' - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=_IMAGE_PLACEHOLDER_TOKEN_ID, - ) - audio_projection_mode = 'vision' - - if audio_input is not None: - audio_embeds = self._process_audio_input( - audio_input, audio_projection_mode=audio_projection_mode) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - 
audio_embeds, - placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1214,22 +1184,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - audio_input = self._parse_and_validate_audio_input(**kwargs) - - if image_input is None and audio_input is None: - inputs_embeds = None - else: - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - audio_input=audio_input) - input_ids = None - hidden_states = self.model( input_ids, positions, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 2c04b6f0f4f9..6344fc394833 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -444,17 +444,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.vision_args.image_token_id, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index bfa398ee43b5..8f069710b0f9 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -69,8 +69,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) try: import flash_attn @@ -908,26 +907,6 @@ def get_multimodal_embeddings_v0( multimodal_embeddings.append((video_embeds, "video")) return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is None or len(multimodal_embeddings) == 0: - return inputs_embeds - - for embeddings, modality in multimodal_embeddings: - if modality == "audio": - placeholder_token_id = self.config.audio_token_index - if modality == "image": - placeholder_token_id = self.config.image_token_index - if modality == "video": - placeholder_token_id = self.config.video_token_index - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, embeddings, placeholder_token_id) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -939,14 +918,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model 
runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings_v0(**kwargs) - inputs_embeds = self.get_input_embeddings_v0( - input_ids, multimodal_embeddings) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 5b092b42205f..da3889d31a7d 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -64,7 +64,6 @@ from vllm.multimodal.processing import PromptReplacement, PromptUpdate from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import uses_mrope from vllm.utils import is_pin_memory_available from vllm.utils.tensor_schema import TensorSchema, TensorShape @@ -75,8 +74,7 @@ from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo, apply_rotary_pos_emb_vision) from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) @@ -1365,40 +1363,6 @@ def get_multimodal_embeddings(self, multimodal_embeddings += video_embeddings return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Qwen2_5_VLImageInputs] = None, - video_input: Optional[Qwen2_5_VLVideoInputs] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - if self.is_multimodal_pruning_enabled: - image_embeds = self._postprocess_image_embeds_evs( - image_embeds, image_input - ) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - 
inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - if self.is_multimodal_pruning_enabled: - video_embeds = self._postprocess_video_embeds_evs( - video_embeds, video_input - ) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_id, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1421,26 +1385,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input) - input_ids = None - hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 9dfa29eef5ce..f9136863b8d6 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -449,17 +449,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - multimodal_embeddings, - is_multimodal=input_ids == self.config.audio_token_index, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 6ef01f333554..f83a411459cc 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -65,15 +65,13 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMRoPE, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) @@ -1464,32 +1462,6 @@ def get_multimodal_embeddings(self, return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Qwen2VLImagePixelInputs] = None, - video_input: Optional[Qwen2VLVideoPixelInputs] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - 
inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_id, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1515,26 +1487,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input) - input_ids = None - hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 6d2a6019ef6f..ce92557d6424 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -68,7 +68,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import uses_mrope from vllm.utils import is_list_of from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -82,8 +81,7 @@ from .qwen2_vl import Qwen2VLProcessingInfo from .qwen3 import Qwen3ForCausalLM, Qwen3Model from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, - _merge_multimodal_embeddings, maybe_prefix, - merge_multimodal_embeddings) + _merge_multimodal_embeddings, maybe_prefix) from .vision 
import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) @@ -1464,75 +1462,6 @@ def get_input_embeddings( return inputs_embeds - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Qwen2_5_VLImageInputs] = None, - video_input: Optional[Qwen2_5_VLVideoInputs] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - - if self.use_deepstack: - visual_dim = inputs_embeds.shape[-1] - deepstack_input_embeds = None - if image_input is not None or video_input is not None: - deepstack_input_embeds = torch.zeros_like( - inputs_embeds).unsqueeze(1).repeat( - 1, self.deepstack_num_level, 1).flatten(1) - - if image_input is not None: - image_embeds = self._process_image_input(image_input) - if self.use_deepstack: - image_embeds = torch.cat(image_embeds) - - image_embeds, image_embeds_multiscale = image_embeds.split( - [visual_dim, visual_dim * self.deepstack_num_level], - dim=-1) - - deepstack_input_embeds = merge_multimodal_embeddings( - input_ids, - deepstack_input_embeds, - image_embeds_multiscale, - placeholder_token_id=self.config.image_token_id, - ) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - if self.use_deepstack: - video_embeds = torch.cat(video_embeds) - - video_embeds, video_embeds_multiscale = video_embeds.split( - [visual_dim, visual_dim * self.deepstack_num_level], - dim=-1) - - deepstack_input_embeds = merge_multimodal_embeddings( - input_ids, - deepstack_input_embeds, - video_embeds_multiscale, - placeholder_token_id=self.config.video_token_id, - ) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_id, - ) - - if self.use_deepstack and deepstack_input_embeds is not None: - 
deepstack_input_embeds = deepstack_input_embeds.view( - inputs_embeds.shape[0], self.deepstack_num_level, - visual_dim).permute(1, 0, 2).contiguous() - self._set_deepstack_input_embeds(deepstack_input_embeds) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1568,26 +1497,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input) - input_ids = None - if self.use_deepstack and inputs_embeds is not None and get_pp_group( ).is_first_rank: deepstack_input_embeds = self._get_deepstack_input_embeds( diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index dc11b60604a9..924119ed63ab 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -767,18 +767,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == - self.transformer.visual.image_pad_id, - ) - input_ids = None - hidden_states = self.transformer(input_ids, positions, intermediate_tensors, inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index f9a107c06085..f03022aa719c 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -874,17 +874,6 @@ def forward( input_ids = None inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - vision_embeddings, - is_multimodal=input_ids == self.img_context_token_id, - ) - input_ids = None - forward_kwargs = { "input_ids": input_ids, "positions": positions, diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 7cfb639f675d..00d87f560e70 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -881,19 +881,6 @@ def forward( inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: - # NOTE: In v1, inputs_embeds is always generated at model runner from - # `get_multimodal_embeddings` and `get_input_embeddings`, this - # condition is only for v0 compatibility. 
- if inputs_embeds is None: - multimodal_embeds = self.get_multimodal_embeddings(**kwargs) - if multimodal_embeds is not None: - inputs_embeds = self.get_input_embeddings( - input_ids, - multimodal_embeds, - is_multimodal=input_ids == self.config.image_token_id, - ) - input_ids = None - model_output = super().forward(input_ids, positions, intermediate_tensors, inputs_embeds) return model_output diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 77e886c22e63..70aabf6dfe78 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -597,18 +597,6 @@ def forward(self, if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - - inputs_embeds = self.get_input_embeddings( - input_ids, - multimodal_embeddings, - is_multimodal=input_ids == self.config.audio_token_index, - ) - input_ids = None - language_model = self.language_model if hasattr(language_model, "language_model"): language_model = language_model.language_model diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index f93e7ccfd06f..1edeaeb0f319 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -371,19 +371,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - audio_encoder = self.tokenizer.instruct.audio_encoder - audio_tok_id = audio_encoder.audio_token - audio_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings( - input_ids, - audio_embeddings, - is_multimodal=input_ids == audio_tok_id, - ) - input_ids = None - hidden_states = self.language_model.model(input_ids, positions, intermediate_tensors, From 86502dcf502e0f2fa0d1fb7406b1de253e2861e6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 29 Sep 2025 16:03:37 +0800 Subject: [PATCH 002/193] [Misc] Remove more `get_input_embeddings_v0` (#25857) Signed-off-by: DarkLight1337 Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- vllm/model_executor/models/gemma3n_mm.py | 3 +- vllm/model_executor/models/keye.py | 46 +------------------ vllm/model_executor/models/phi4_multimodal.py | 37 +-------------- vllm/model_executor/models/utils.py | 4 ++ 4 files changed, 7 insertions(+), 83 deletions(-) diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index b23437a08e5a..101e083ac123 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -45,8 +45,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsTranscription) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) logger = init_logger(__name__) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 62a71b7b1fa8..10b5c45169f4 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -41,7 +41,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import uses_mrope from vllm.utils 
import is_list_of from vllm.utils.tensor_schema import TensorSchema, TensorShape @@ -50,7 +49,7 @@ from .siglip import SiglipMLP from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, is_pp_missing_parameter, - maybe_prefix, merge_multimodal_embeddings) + maybe_prefix) from .vision import get_vit_attn_backend logger = init_logger(__name__) @@ -1450,32 +1449,6 @@ def get_multimodal_embeddings( multimodal_embeddings += video_embeddings return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Any] = None, - video_input: Optional[Any] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_id, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1500,23 +1473,6 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input, - ) - input_ids = None - hidden_states = self.language_model.model( input_ids=input_ids, 
positions=positions, diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index 3dbb67d28065..a4f9f96cb951 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -44,13 +44,7 @@ from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) - -# <|endoftext10|> (see vocab.json in hf model) -_IMAGE_PLACEHOLDER_TOKEN_ID = 200010 -# <|endoftext11|> -_AUDIO_PLACEHOLDER_TOKEN_ID = 200011 + init_vllm_registered_model, maybe_prefix) _AUDIO_MAX_SOUNDFILE_SIZE = 241_000 @@ -1371,35 +1365,6 @@ def get_multimodal_embeddings(self, return multimodal_embeddings - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[Phi4MMImagePixelInputs] = None, - audio_input: Optional[Phi4MMAudioFeatureInputs] = None, - ) -> torch.Tensor: - audio_projection_mode = 'speech' - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=_IMAGE_PLACEHOLDER_TOKEN_ID, - ) - audio_projection_mode = 'vision' - - if audio_input is not None: - audio_embeds = self._process_audio_input( - audio_input, audio_projection_mode=audio_projection_mode) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - audio_embeds, - placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 4bf151fbf62d..d6fa88f06e56 100644 --- a/vllm/model_executor/models/utils.py +++ 
b/vllm/model_executor/models/utils.py @@ -10,6 +10,7 @@ import torch.nn as nn from torch.func import functional_call from transformers import PretrainedConfig +from typing_extensions import deprecated import vllm.envs as envs from vllm.config import VllmConfig @@ -439,6 +440,9 @@ def _merge_multimodal_embeddings( return inputs_embeds +@deprecated("`merge_multimodal_embeddings` has been replaced with " + "`SupportsMultiModal.get_input_embeddings` and will be " + "removed in v0.12.") def merge_multimodal_embeddings( input_ids: torch.Tensor, inputs_embeds: torch.Tensor, From 219bc0b4e0afc5ebe2583a2022b57c6ec5efa29f Mon Sep 17 00:00:00 2001 From: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Date: Mon, 29 Sep 2025 20:13:26 +0300 Subject: [PATCH 003/193] refactor - pass tokens_per_frame and num_frames to compute_retained_tokens_count so code can be reused in nano_nemotrron_vl which doesn't have thw Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- vllm/model_executor/models/qwen2_5_vl.py | 7 +++++-- vllm/multimodal/evs.py | 14 +++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index da3889d31a7d..d6e0d5f00b1e 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -939,9 +939,12 @@ def get_replacement_qwen2vl(item_idx: int, modality: str): ).video_pruning_rate if (modality == "video" and video_pruning_rate is not None and video_pruning_rate > 0.0): + T, H, W = map(int, grid_thw) + tokens_per_frame = (H // image_processor.merge_size) * ( + W // image_processor.merge_size) num_tokens = compute_retained_tokens_count( - grid_thw, - image_processor.merge_size, + tokens_per_frame, + T, video_pruning_rate, ) # End of EVS-specific code diff --git a/vllm/multimodal/evs.py b/vllm/multimodal/evs.py index 056f3d905968..79f1f4f88aea 100644 --- a/vllm/multimodal/evs.py +++ 
b/vllm/multimodal/evs.py @@ -13,24 +13,24 @@ import torch -def compute_retained_tokens_count(video_size_thw: torch.LongTensor, - spatial_merge_size: int, q: float) -> int: +def compute_retained_tokens_count(tokens_per_frame: int, num_frames: int, + q: float) -> int: """ Compute the number of retained tokens for a given video. Method ensures that we retain all the tokens from the first frame regardless of the pruning rate. Args: - video_size_thw: The size of the video in the format of (T, H, W). - spatial_merge_size: The size of the spatial merge. + tokens_per_frame: The number of tokens per frame. + num_frames: The total number of frames. q: The pruning rate. Returns: The number of retained tokens. """ - T, H, W = map(int, video_size_thw) - min_num_tokens = (H // spatial_merge_size) * (W // spatial_merge_size) - evs_num_tokens = int(T * min_num_tokens * (1 - q)) + total_tokens = tokens_per_frame * num_frames + evs_num_tokens = int(total_tokens * (1 - q)) + min_num_tokens = tokens_per_frame return max(min_num_tokens, evs_num_tokens) From 23a205fd436a11e0305e954f7c6ed2f64745e0ca Mon Sep 17 00:00:00 2001 From: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Date: Tue, 30 Sep 2025 23:30:53 +0300 Subject: [PATCH 004/193] WIP - commit with all commented code Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- .../model_executor/models/nano_nemotron_vl.py | 55 ++++++++++++++++--- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 2d0ebdc90277..94fa5346a223 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -44,11 +44,11 @@ ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, - PromptUpdate, PromptUpdateDetails) + PromptUpdate, PromptUpdateDetails, _seq2tokens) from 
vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.radio import RadioConfig -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizer import AnyTokenizer, cached_tokenizer_from_config, encode_tokens from vllm.utils.tensor_schema import TensorSchema, TensorShape # Configure PIL to handle large images without warnings @@ -483,20 +483,26 @@ def get_image_repl( return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) + @classmethod def get_video_repl( - self, + cls, feature_size: int, num_patches: Optional[int] = None, + # feature_size_per_patch: list[int], + # num_patches: int, video_context_token: str = IMG_CONTEXT, ) -> PromptUpdateDetails[str]: - repl_features = video_context_token * self.num_image_token + repl_features = video_context_token * feature_size repl_features_with_sep = IMG_START + repl_features + IMG_END + # repl_features_with_sep = lambda x: IMG_START + video_context_token * feature_size_per_patch[x] + IMG_END # num_patches is equal to num_frames repl_full = ''.join([ f'Frame{i+1}: {repl_features_with_sep}' for i in range(num_patches) + # f'Frame{i+1}: {repl_features_with_sep(i)}' for i in range(num_patches) ]) - return PromptUpdateDetails.select_text(repl_full, video_context_token) + # return PromptUpdateDetails.select_text(repl_full, video_context_token) + return PromptUpdateDetails.select_text(repl_full, repl_full) class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo): @@ -784,9 +790,22 @@ def get_video_replacement_internvl(item_idx: int): if num_patches is not None: assert isinstance(num_patches, int) + # # EVS-specific code + # video_pruning_rate = self.info.ctx.get_mm_config().video_pruning_rate + # if video_pruning_rate is not None and video_pruning_rate > 0.0: + # T, H, W = map(int, grid_thw) + # tokens_per_frame = (H // image_processor.merge_size) * ( + # W // image_processor.merge_size) + # num_tokens = 
compute_retained_tokens_count( + # tokens_per_frame, + # T, + # video_pruning_rate, + # ) + # # End of EVS-specific code + return hf_processor.get_video_repl( - feature_size, - num_patches, + feature_size, # number of tokens per frame + num_patches, # number of frames video_context_token=hf_processor.video_token) if self.info.supports_video: @@ -901,6 +920,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.vision_model = self.get_vit_model_from_radio_config(config).to( self.language_model.config.torch_dtype) + self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config) + # Construct the vision projection. vit_hidden_size = config.vit_hidden_size vision_projection_hidden_size = config.projector_hidden_size @@ -1116,6 +1137,26 @@ def get_multimodal_embeddings(self, if modality == "videos": video_input = modalities["videos"] video_embeddings = self._process_image_input(video_input) + # num_patches = video_input["num_patches"] + # feature_size = int(video_embeddings[0].shape[0] / num_patches) + # construct video_repl + mask with repl = PromptUpdateDetails.select_text(repl_full, video_context_token) + # create larger zeros tensor with shape (len(repl.is_embed),hidden_size). Called X for now + # X[repl.is_embed] = video_embeddings + # X[~repl.is_embed] = self.llm.get_input_embeddings(repl tokens where repl.is_embed is False) + # video_embeddings = X + num_patches = video_input["num_patches"][0].item() + assert video_embeddings[0].shape[0] % num_patches == 0 + feature_size = video_embeddings[0].shape[0] // num_patches + device = video_embeddings[0].device + video_repl_text = NanoNemotronVLProcessor.get_video_repl(feature_size, num_patches, IMG_CONTEXT).full + repl_token_ids = torch.tensor(_seq2tokens(self.tokenizer, video_repl_text), device=device) + embed_token_ids = torch.tensor(encode_tokens(self.tokenizer, IMG_CONTEXT), device=device) # TODO: Can just use ID. 
this also adds BOS but that doesn't exist in repl_token_ids since _seq2tokens uses add_special_tokens=False + is_video_embed = torch.isin(repl_token_ids, embed_token_ids) + video_repl_embeddings = torch.empty(repl_token_ids.shape[0], video_embeddings[0].shape[1], dtype=video_embeddings[0].dtype, device=device) + video_repl_embeddings[is_video_embed] = video_embeddings[0] + video_repl_embeddings[~is_video_embed] = self.language_model.get_input_embeddings(repl_token_ids[~is_video_embed]) + video_embeddings = (video_repl_embeddings,) + multimodal_embeddings += video_embeddings return multimodal_embeddings From e8fd68a5c174dcb724fcbe9eaca315a379aca19b Mon Sep 17 00:00:00 2001 From: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Date: Tue, 30 Sep 2025 23:33:32 +0300 Subject: [PATCH 005/193] Revert "WIP - commit with all commented code" This reverts commit c5dad7e180778bed9759bf3044e8ed298e04b195. Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- .../model_executor/models/nano_nemotron_vl.py | 55 +++---------------- 1 file changed, 7 insertions(+), 48 deletions(-) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 94fa5346a223..2d0ebdc90277 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -44,11 +44,11 @@ ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, - PromptUpdate, PromptUpdateDetails, _seq2tokens) + PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.radio import RadioConfig -from vllm.transformers_utils.tokenizer import AnyTokenizer, cached_tokenizer_from_config, encode_tokens +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils.tensor_schema import TensorSchema, 
TensorShape # Configure PIL to handle large images without warnings @@ -483,26 +483,20 @@ def get_image_repl( return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) - @classmethod def get_video_repl( - cls, + self, feature_size: int, num_patches: Optional[int] = None, - # feature_size_per_patch: list[int], - # num_patches: int, video_context_token: str = IMG_CONTEXT, ) -> PromptUpdateDetails[str]: - repl_features = video_context_token * feature_size + repl_features = video_context_token * self.num_image_token repl_features_with_sep = IMG_START + repl_features + IMG_END - # repl_features_with_sep = lambda x: IMG_START + video_context_token * feature_size_per_patch[x] + IMG_END # num_patches is equal to num_frames repl_full = ''.join([ f'Frame{i+1}: {repl_features_with_sep}' for i in range(num_patches) - # f'Frame{i+1}: {repl_features_with_sep(i)}' for i in range(num_patches) ]) - # return PromptUpdateDetails.select_text(repl_full, video_context_token) - return PromptUpdateDetails.select_text(repl_full, repl_full) + return PromptUpdateDetails.select_text(repl_full, video_context_token) class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo): @@ -790,22 +784,9 @@ def get_video_replacement_internvl(item_idx: int): if num_patches is not None: assert isinstance(num_patches, int) - # # EVS-specific code - # video_pruning_rate = self.info.ctx.get_mm_config().video_pruning_rate - # if video_pruning_rate is not None and video_pruning_rate > 0.0: - # T, H, W = map(int, grid_thw) - # tokens_per_frame = (H // image_processor.merge_size) * ( - # W // image_processor.merge_size) - # num_tokens = compute_retained_tokens_count( - # tokens_per_frame, - # T, - # video_pruning_rate, - # ) - # # End of EVS-specific code - return hf_processor.get_video_repl( - feature_size, # number of tokens per frame - num_patches, # number of frames + feature_size, + num_patches, video_context_token=hf_processor.video_token) if self.info.supports_video: @@ -920,8 +901,6 @@ def 
__init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.vision_model = self.get_vit_model_from_radio_config(config).to( self.language_model.config.torch_dtype) - self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config) - # Construct the vision projection. vit_hidden_size = config.vit_hidden_size vision_projection_hidden_size = config.projector_hidden_size @@ -1137,26 +1116,6 @@ def get_multimodal_embeddings(self, if modality == "videos": video_input = modalities["videos"] video_embeddings = self._process_image_input(video_input) - # num_patches = video_input["num_patches"] - # feature_size = int(video_embeddings[0].shape[0] / num_patches) - # construct video_repl + mask with repl = PromptUpdateDetails.select_text(repl_full, video_context_token) - # create larger zeros tensor with shape (len(repl.is_embed),hidden_size). Called X for now - # X[repl.is_embed] = video_embeddings - # X[~repl.is_embed] = self.llm.get_input_embeddings(repl tokens where repl.is_embed is False) - # video_embeddings = X - num_patches = video_input["num_patches"][0].item() - assert video_embeddings[0].shape[0] % num_patches == 0 - feature_size = video_embeddings[0].shape[0] // num_patches - device = video_embeddings[0].device - video_repl_text = NanoNemotronVLProcessor.get_video_repl(feature_size, num_patches, IMG_CONTEXT).full - repl_token_ids = torch.tensor(_seq2tokens(self.tokenizer, video_repl_text), device=device) - embed_token_ids = torch.tensor(encode_tokens(self.tokenizer, IMG_CONTEXT), device=device) # TODO: Can just use ID. 
this also adds BOS but that doesn't exist in repl_token_ids since _seq2tokens uses add_special_tokens=False - is_video_embed = torch.isin(repl_token_ids, embed_token_ids) - video_repl_embeddings = torch.empty(repl_token_ids.shape[0], video_embeddings[0].shape[1], dtype=video_embeddings[0].dtype, device=device) - video_repl_embeddings[is_video_embed] = video_embeddings[0] - video_repl_embeddings[~is_video_embed] = self.language_model.get_input_embeddings(repl_token_ids[~is_video_embed]) - video_embeddings = (video_repl_embeddings,) - multimodal_embeddings += video_embeddings return multimodal_embeddings From 859e9f1cede57337aea079c98faba0115d43908f Mon Sep 17 00:00:00 2001 From: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Date: Wed, 1 Oct 2025 00:16:07 +0300 Subject: [PATCH 006/193] Manually deal with video prompt replacement instead of relying on vLLM mechanism: 1. get_video_repl now doesn't mask the indicator tokens - it signals vLLM to replace all placeholder embeddings with the video embeddings returned by get_multimodal_embeddings 2. get_multimodal_embeddings handles interleaving video embeddings with text embeddings for indicator tokens 3. This is done by creating the video replacement text again in get_multimodal_embeddings, tokenizing it, and masking the indicator tokens. Indicator tokens embeddings are calculated by calling self.language_model.get_input_embeddings() directly 4. 
The tokenizer was added to NemotronH_Nano_VL_V2, to allow for tokenizing in get_multimodal_embeddings() Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- .../model_executor/models/nano_nemotron_vl.py | 104 ++++++++++++++++-- 1 file changed, 95 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 2d0ebdc90277..3d93adc6ac54 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -44,11 +44,14 @@ ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, - PromptUpdate, PromptUpdateDetails) + PromptUpdate, PromptUpdateDetails, + _seq2tokens) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.radio import RadioConfig -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizer import (AnyTokenizer, + cached_tokenizer_from_config, + encode_tokens) from vllm.utils.tensor_schema import TensorSchema, TensorShape # Configure PIL to handle large images without warnings @@ -483,20 +486,21 @@ def get_image_repl( return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) + @classmethod def get_video_repl( - self, + cls, feature_size: int, - num_patches: Optional[int] = None, + num_patches: int, video_context_token: str = IMG_CONTEXT, ) -> PromptUpdateDetails[str]: - repl_features = video_context_token * self.num_image_token + repl_features = video_context_token * feature_size repl_features_with_sep = IMG_START + repl_features + IMG_END # num_patches is equal to num_frames repl_full = ''.join([ f'Frame{i+1}: {repl_features_with_sep}' for i in range(num_patches) ]) - return PromptUpdateDetails.select_text(repl_full, video_context_token) + return PromptUpdateDetails.select_text(repl_full, 
repl_full) class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo): @@ -784,9 +788,20 @@ def get_video_replacement_internvl(item_idx: int): if num_patches is not None: assert isinstance(num_patches, int) + # # EVS-specific code + # video_pruning_rate = \ + # self.info.ctx.get_mm_config().video_pruning_rate + # if video_pruning_rate is not None and video_pruning_rate > 0.0: + # num_tokens = compute_retained_tokens_count( + # feature_size, + # num_patches, + # video_pruning_rate, + # ) + # # End of EVS-specific code + return hf_processor.get_video_repl( - feature_size, - num_patches, + feature_size, # number of tokens per frame + num_patches, # number of frames video_context_token=hf_processor.video_token) if self.info.supports_video: @@ -901,6 +916,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.vision_model = self.get_vit_model_from_radio_config(config).to( self.language_model.config.torch_dtype) + self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config) + # Construct the vision projection. 
vit_hidden_size = config.vit_hidden_size vision_projection_hidden_size = config.projector_hidden_size @@ -1035,6 +1052,75 @@ def _process_image_input( ] return image_embeds.split(image_feature_sizes) + def _process_video_input( + self, video_input: NanoNemotronVLVideoPixelInputs + ) -> tuple[torch.Tensor, ...]: + """Process video input and create final embeddings with video content + and indicator tokens.""" + # Get video embeddings using the same processing as images + video_embeddings = self._process_image_input(video_input) + + # Calculate video feature dimensions (number of frames and + # their feature size (AKA tokens per frame)) + num_patches = video_input["num_patches"][0].item() + assert video_embeddings[0].shape[0] % num_patches == 0 + feature_size = video_embeddings[0].shape[0] // num_patches + + # Create final embeddings that will replace placeholder embeddings + # with video content and indicator tokens + final_video_embeddings = self._create_final_video_embeddings( + video_embeddings[0], feature_size, num_patches) + + return (final_video_embeddings, ) + + def _create_final_video_embeddings(self, video_embeddings: torch.Tensor, + feature_size: int, + num_patches: int) -> torch.Tensor: + """Create final embeddings that combine video embeddings with + text embeddings of indicator tokens. + + These final embeddings contain: + - Actual video embeddings in positions corresponding to video content + - Text embeddings for indicator tokens (<img>, </img>, and + frame separation text) in their respective positions + + These embeddings will replace the placeholder embeddings to create + input_embeds for the LLM. 
+ """ + device = video_embeddings.device + + # Generate video replacement text and convert to token IDs + video_repl_text = NanoNemotronVLProcessor.get_video_repl( + feature_size, num_patches, IMG_CONTEXT).full + repl_token_ids = torch.tensor(_seq2tokens(self.tokenizer, + video_repl_text), + device=device) + + # Get embedding token IDs for image context + embed_token_ids = torch.tensor(encode_tokens(self.tokenizer, + IMG_CONTEXT), + device=device) + + # Create mask for video embedding positions + is_video_embed = torch.isin(repl_token_ids, embed_token_ids) + + # Initialize final embeddings tensor + final_video_embeddings = torch.empty(repl_token_ids.shape[0], + video_embeddings.shape[1], + dtype=video_embeddings.dtype, + device=device) + + # Replace video embedding positions with actual video embeddings + final_video_embeddings[is_video_embed] = video_embeddings + + # Replace non-video positions with language model embeddings. + # These are the indicator tokens + text_embeddings = self.language_model.get_input_embeddings( + repl_token_ids[~is_video_embed]) + final_video_embeddings[~is_video_embed] = text_embeddings + + return final_video_embeddings + def _parse_and_validate_video_input( self, **kwargs: object) -> Optional[NanoNemotronVLVideoPixelInputs]: @@ -1115,7 +1201,7 @@ def get_multimodal_embeddings(self, multimodal_embeddings += vision_embeddings if modality == "videos": video_input = modalities["videos"] - video_embeddings = self._process_image_input(video_input) + video_embeddings = self._process_video_input(video_input) multimodal_embeddings += video_embeddings return multimodal_embeddings From 69ea5b88dce0363ef3334979a72f4a5c4ca90f62 Mon Sep 17 00:00:00 2001 From: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Date: Wed, 1 Oct 2025 00:43:40 +0300 Subject: [PATCH 007/193] support multiple videos in a batch (and better typehints) Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- 
.../model_executor/models/nano_nemotron_vl.py | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 3d93adc6ac54..28b39b8e7f82 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -1028,7 +1028,8 @@ def _parse_and_validate_image_input( raise AssertionError("This line should be unreachable.") def _process_image_input( - self, image_input: NanoNemotronVLImageInputs) -> torch.Tensor: + self, image_input: NanoNemotronVLImageInputs + ) -> tuple[torch.Tensor, ...]: if image_input["type"] == "image_embeds": return image_input["data"] @@ -1060,18 +1061,22 @@ def _process_video_input( # Get video embeddings using the same processing as images video_embeddings = self._process_image_input(video_input) - # Calculate video feature dimensions (number of frames and - # their feature size (AKA tokens per frame)) - num_patches = video_input["num_patches"][0].item() - assert video_embeddings[0].shape[0] % num_patches == 0 - feature_size = video_embeddings[0].shape[0] // num_patches + final_video_embeddings: tuple[torch.Tensor, ...] = () - # Create final embeddings that will replace placeholder embeddings - # with video content and indicator tokens - final_video_embeddings = self._create_final_video_embeddings( - video_embeddings[0], feature_size, num_patches) + # Calculate video feature dimensions (number of frames and + # their feature size (AKA tokens per frame)) + # TODO: Maybe this can be optimized to avoid the loop? 
+ for i, single_video_embeddings in enumerate(video_embeddings): + num_patches = video_input["num_patches"][i].item() + assert single_video_embeddings.shape[0] % num_patches == 0 + feature_size = single_video_embeddings.shape[0] // num_patches + + # Create final embeddings that will replace placeholder embeddings + # with video content and indicator tokens + final_video_embeddings += (self._create_final_video_embeddings( + single_video_embeddings, feature_size, num_patches), ) - return (final_video_embeddings, ) + return final_video_embeddings def _create_final_video_embeddings(self, video_embeddings: torch.Tensor, feature_size: int, From 0adec4b430dfaade86656575a834b45e46c692a5 Mon Sep 17 00:00:00 2001 From: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Date: Wed, 1 Oct 2025 00:50:33 +0300 Subject: [PATCH 008/193] Add EVS TODOs Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- vllm/model_executor/models/nano_nemotron_vl.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 28b39b8e7f82..329b880f5c49 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -486,6 +486,7 @@ def get_image_repl( return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) + # TODO (EVS): this method should get the number of tokens (AKA feature size) per frame, and not assume it is equal across frames @classmethod def get_video_repl( cls, @@ -788,7 +789,7 @@ def get_video_replacement_internvl(item_idx: int): if num_patches is not None: assert isinstance(num_patches, int) - # # EVS-specific code + # # TODO: EVS-specific code here. This is basically copied from Qwen2-VL. Need to validate it. 
# video_pruning_rate = \ # self.info.ctx.get_mm_config().video_pruning_rate # if video_pruning_rate is not None and video_pruning_rate > 0.0: @@ -1067,6 +1068,13 @@ def _process_video_input( # their feature size (AKA tokens per frame)) # TODO: Maybe this can be optimized to avoid the loop? for i, single_video_embeddings in enumerate(video_embeddings): + + # TODO (EVS): Add EVS code here. This is only a suggestion and maybe there's a better way to do it. + # Compute retention mask and prune the video embeddings. + # Then, pass number of retained tokens per frame to the _create_final_video_embeddings function, + # which will use it to create the video_repl_text with correct number of tokens per frame. + # EVS compute_retention_mask will need to change a bit, since we don't have here any THW data (although maybe it can be computed from num_patches and feature_size... I'm not sure) + num_patches = video_input["num_patches"][i].item() assert single_video_embeddings.shape[0] % num_patches == 0 feature_size = single_video_embeddings.shape[0] // num_patches From d1a4d414f1ea3efbe667e52b9f24a786b2033c53 Mon Sep 17 00:00:00 2001 From: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Date: Wed, 1 Oct 2025 01:24:36 +0300 Subject: [PATCH 009/193] access tokenizer only when needed instead of saving it as attribute of NemotronH_Nano_VL_V2 Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- vllm/model_executor/models/nano_nemotron_vl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 329b880f5c49..8d8c1f6dd73a 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -917,8 +917,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.vision_model = self.get_vit_model_from_radio_config(config).to( self.language_model.config.torch_dtype) - 
self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config) - # Construct the vision projection. vit_hidden_size = config.vit_hidden_size vision_projection_hidden_size = config.projector_hidden_size @@ -943,6 +941,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.img_context_token_id = None self.video_context_token_id = None self.config = config + self.model_config = vllm_config.model_config def pixel_shuffle(self, x, scale_factor=0.5): n, w, h, c = x.size() @@ -1105,7 +1104,8 @@ def _create_final_video_embeddings(self, video_embeddings: torch.Tensor, # Generate video replacement text and convert to token IDs video_repl_text = NanoNemotronVLProcessor.get_video_repl( feature_size, num_patches, IMG_CONTEXT).full - repl_token_ids = torch.tensor(_seq2tokens(self.tokenizer, + tokenizer = cached_tokenizer_from_config(self.model_config) + repl_token_ids = torch.tensor(_seq2tokens(tokenizer, video_repl_text), device=device) From a7417d05fdd81f375a9102a09849547e8e9d6607 Mon Sep 17 00:00:00 2001 From: Eugene Khvedchenia Date: Fri, 3 Oct 2025 17:35:18 +0300 Subject: [PATCH 010/193] Fix issue with using top-left tile instead of thumbnail tile Signed-off-by: Eugene Khvedchenia Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- .../model_executor/models/nano_nemotron_vl.py | 87 ++++++++++++++----- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 8d8c1f6dd73a..7a0065d71151 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -37,6 +37,7 @@ init_vllm_registered_model, maybe_prefix) from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.evs import compute_retained_tokens_count from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, MultiModalKwargsItems, NestedTensors) @@ -203,6 +204,8 @@ 
def video_to_pixel_values( # with image path frames_tensors: list[torch.Tensor] = [] for frame in video: + # (ekvhedchenia) TODO: we probably should not use tiling at all for videos as we take + # thumbnail tile of fixed size anyway pil_frame = dynamic_preprocess( Image.fromarray(frame, mode="RGB"), image_size=input_size, @@ -212,7 +215,9 @@ def video_to_pixel_values( ) # dynamic_preprocess returns tensors already; take the single tile assert len(pil_frame) >= 1 - frames_tensors.append(pil_frame[0]) + # frames_tensors.append(pil_frame[0]) + # (ekvhedchenia) I think what we meant is take thumbnail tile (Which happen to be last one) + frames_tensors.append(pil_frame[-1]) return torch.stack(frames_tensors) @@ -435,6 +440,7 @@ def _preprocess_video( video_repl = self.get_video_repl(self.num_image_token, num_patches, self.video_token) + text = [t.replace('