Size the model runner's `inputs_embeds` buffers with a dedicated
`ModelConfig.get_inputs_embeds_size()` instead of `get_hidden_size()`.
For CLIP and SigLIP, text embeddings are padded up to that size in
`_embed_text_input_ids()` and sliced back to `text_embed_dim` in `forward()`.

NOTE(review): the `index` lines are carried over unchanged, so their
post-image blob hashes do not match the reviewed contents of this patch.

diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py
index 3345b10c099a..c973676ba027 100644
--- a/tests/models/multimodal/pooling/test_siglip.py
+++ b/tests/models/multimodal/pooling/test_siglip.py
@@ -19,7 +19,12 @@
     }
 )
 
-MODELS = ["google/siglip-base-patch16-224", "google/siglip2-base-patch16-224"]
+MODELS = [
+    "google/siglip-base-patch16-224",
+    "google/siglip2-base-patch16-224",
+    # Different image embedding dim than text_config.hidden_size
+    "google/siglip2-giant-opt-patch16-384",
+]
 
 
 def _run_test(
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 6b3657381354..92cd48402a65 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1202,6 +1202,23 @@ def get_vocab_size(self) -> int:
     def get_hidden_size(self) -> int:
         return getattr(self.hf_text_config, "hidden_size", 0)
 
+    def get_inputs_embeds_size(self) -> int:
+        """Return the width of the `inputs_embeds` buffer.
+
+        The size of inputs_embeds is usually identical to the size
+        of the hidden states, however there are exceptions, such as
+        embedding models like CLIP and SigLIP, whose text config
+        may carry a separate projection size.
+        """
+        for target_attr in ("projection_dim", "projection_size"):
+            # A missing, None or zero value is treated as "no projection size
+            # configured", so skip it rather than return an unusable width
+            size = getattr(self.hf_text_config, target_attr, None)
+            if size:
+                return size
+
+        return self.get_hidden_size()
+
     @property
     def is_deepseek_mla(self) -> bool:
         if not hasattr(self.hf_text_config, "model_type"):
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index c2993b47dc3f..b8af3050990b 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import cached_property
 from typing import Annotated, Literal
 
@@ -903,6 +903,46 @@ def _process_image_inputs(self, inputs: CLIPImagePixelInputs) -> torch.Tensor:
     def get_language_model(self) -> torch.nn.Module:
         return self.text_model
 
+    def _embed_text_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        embed_input_ids: Callable[[torch.Tensor], torch.Tensor],
+        *,
+        is_multimodal: torch.Tensor | None,
+        handle_oov_mm_token: bool,
+    ) -> torch.Tensor:
+        """Embed text tokens, padded to the size of the model runner buffer."""
+        inputs_embeds = super()._embed_text_input_ids(
+            input_ids,
+            embed_input_ids,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+
+        # NOTE: inputs_embeds in model runner has size text_config.projection_dim
+        # (instead of text_config.hidden_size) to accommodate image embeddings
+        inputs_embeds_size = self.projection_dim
+        if inputs_embeds.shape[1] < inputs_embeds_size:
+            # The padding columns are left uninitialized: for text inputs,
+            # forward() slices inputs_embeds back to text_embed_dim columns.
+            inputs_embeds = torch.cat(
+                [
+                    inputs_embeds,
+                    inputs_embeds.new_empty(
+                        inputs_embeds.shape[0],
+                        inputs_embeds_size - inputs_embeds.shape[1],
+                    ),
+                ],
+                dim=1,
+            )
+        elif inputs_embeds.shape[1] > inputs_embeds_size:
+            # No need to handle this case for now
+            raise NotImplementedError(
+                "Text embeddings wider than projection_dim are not supported"
+            )
+
+        return inputs_embeds
+
     def embed_input_ids(
         self,
         input_ids: torch.Tensor,
@@ -949,10 +989,18 @@ def forward(
         if not self._is_text_input:
             return inputs_embeds
 
-        # Text inputs
-        return self.get_text_features(
-            input_ids=input_ids, position_ids=positions, inputs_embeds=inputs_embeds
-        )
+        # NOTE: inputs_embeds in model runner has size text_config.projection_dim
+        # (instead of text_config.hidden_size) to accommodate image embeddings
+        hidden_size = self.text_embed_dim
+        if inputs_embeds.shape[1] > hidden_size:
+            inputs_embeds = inputs_embeds[:, :hidden_size]
+        elif inputs_embeds.shape[1] < hidden_size:
+            # No need to handle this case for now
+            raise NotImplementedError(
+                "inputs_embeds narrower than the text hidden size is not supported"
+            )
+
+        return self.get_text_features(input_ids, positions, inputs_embeds)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index ce5847bf79a5..9db1423d98e0 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -1,10 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Implementation of SiglipVisionModel intended to be only used
-within a vision language model."""
 
 import math
-from collections.abc import Iterable, Mapping
+from collections.abc import Callable, Iterable, Mapping
 from functools import cached_property
 from typing import Annotated, Literal
 
@@ -976,6 +974,7 @@ def forward(
 
         position_embeddings = self.position_embedding(position_ids)
         embeddings = inputs_embeds + position_embeddings
+
         return embeddings
 
 
@@ -1145,6 +1144,46 @@ def _process_image_inputs(self, inputs: SiglipImagePixelInputs) -> torch.Tensor:
     def get_language_model(self) -> torch.nn.Module:
         return self.text_model
 
+    def _embed_text_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        embed_input_ids: Callable[[torch.Tensor], torch.Tensor],
+        *,
+        is_multimodal: torch.Tensor | None,
+        handle_oov_mm_token: bool,
+    ) -> torch.Tensor:
+        """Embed text tokens, padded to the size of the model runner buffer."""
+        inputs_embeds = super()._embed_text_input_ids(
+            input_ids,
+            embed_input_ids,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+
+        # NOTE: inputs_embeds in model runner has size text_config.projection_size
+        # (instead of text_config.hidden_size) to accommodate image embeddings
+        inputs_embeds_size = self.text_projection_size
+        if inputs_embeds.shape[1] < inputs_embeds_size:
+            # The padding columns are left uninitialized: for text inputs,
+            # forward() slices inputs_embeds back to text_embed_dim columns.
+            inputs_embeds = torch.cat(
+                [
+                    inputs_embeds,
+                    inputs_embeds.new_empty(
+                        inputs_embeds.shape[0],
+                        inputs_embeds_size - inputs_embeds.shape[1],
+                    ),
+                ],
+                dim=1,
+            )
+        elif inputs_embeds.shape[1] > inputs_embeds_size:
+            # No need to handle this case for now
+            raise NotImplementedError(
+                "Text embeddings wider than text_projection_size are not supported"
+            )
+
+        return inputs_embeds
+
     def embed_input_ids(
         self,
         input_ids: torch.Tensor,
@@ -1190,6 +1229,17 @@ def forward(
         if not self._is_text_input:
             return inputs_embeds
 
+        # NOTE: inputs_embeds in model runner has size text_config.projection_size
+        # (instead of text_config.hidden_size) to accommodate image embeddings
+        hidden_size = self.text_embed_dim
+        if inputs_embeds.shape[1] > hidden_size:
+            inputs_embeds = inputs_embeds[:, :hidden_size]
+        elif inputs_embeds.shape[1] < hidden_size:
+            # No need to handle this case for now
+            raise NotImplementedError(
+                "inputs_embeds narrower than the text hidden size is not supported"
+            )
+
         return self.get_text_features(input_ids, positions, inputs_embeds)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 72f9d15bc132..d7111d52dd8a 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -80,6 +80,7 @@ def __init__(
         # the draft model's hidden size can be different from the target model's
         # hidden size (e.g., Llama 3.3 70B).
         self.hidden_size = self.draft_model_config.get_hidden_size()
+        self.inputs_embeds_size = self.draft_model_config.get_inputs_embeds_size()
 
         # Multi-modal data support
         self.mm_registry = MULTIMODAL_REGISTRY
@@ -151,7 +152,9 @@ def __init__(
         )
 
         self.inputs_embeds = torch.zeros(
-            (self.max_num_tokens, self.hidden_size), dtype=self.dtype, device=device
+            (self.max_num_tokens, self.inputs_embeds_size),
+            dtype=self.dtype,
+            device=device,
         )
 
         self.backup_next_token_ids = CpuGpuBuffer(
diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py
index 3f8ef03f9644..8ae887fe82cf 100644
--- a/vllm/v1/worker/gpu/input_batch.py
+++ b/vllm/v1/worker/gpu/input_batch.py
@@ -17,7 +17,7 @@ def __init__(
         self,
         max_num_reqs: int,
         max_num_tokens: int,
-        hidden_size: int,
+        inputs_embeds_size: int,
         vocab_size: int,
         dtype: torch.dtype,
         device: torch.device,
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index fdb930c4dcd7..9bf345053c30 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -98,7 +98,7 @@ def __init__(
         self.max_model_len = self.model_config.max_model_len
         self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
         self.max_num_reqs = self.scheduler_config.max_num_seqs
-        self.hidden_size = self.model_config.get_hidden_size()
+        self.inputs_embeds_size = self.model_config.get_inputs_embeds_size()
 
         self.dp_size = self.parallel_config.data_parallel_size
         self.dp_rank = self.parallel_config.data_parallel_rank
@@ -134,7 +134,7 @@ def __init__(
         self.input_buffers = InputBuffers(
             max_num_reqs=self.max_num_reqs,
             max_num_tokens=self.max_num_tokens,
-            hidden_size=self.hidden_size,
+            inputs_embeds_size=self.inputs_embeds_size,
             vocab_size=self.vocab_size,
             dtype=self.dtype,
             device=self.device,
diff --git a/vllm/v1/worker/gpu/spec_decode/eagle.py b/vllm/v1/worker/gpu/spec_decode/eagle.py
index a2d0550326f3..8848e220eb5b 100644
--- a/vllm/v1/worker/gpu/spec_decode/eagle.py
+++ b/vllm/v1/worker/gpu/spec_decode/eagle.py
@@ -44,6 +44,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         # the draft model's hidden size can be different from the target model's
         # hidden size (e.g., Llama 3.3 70B).
         self.hidden_size = self.draft_model_config.get_hidden_size()
+        self.inputs_embeds_size = self.draft_model_config.get_inputs_embeds_size()
         self.vocab_size = self.draft_model_config.get_vocab_size()
         self.pin_memory = is_pin_memory_available()
         self.dtype = vllm_config.model_config.dtype
@@ -51,7 +52,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.input_buffers = InputBuffers(
             max_num_reqs=self.max_num_reqs,
             max_num_tokens=self.max_num_tokens,
-            hidden_size=self.hidden_size,
+            inputs_embeds_size=self.inputs_embeds_size,
             vocab_size=self.vocab_size,
             dtype=self.dtype,
             device=device,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 9b0fb07297ac..b40e88681339 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -320,7 +320,7 @@ def __init__(
 
         # Model-related.
         self.num_query_heads = model_config.get_num_attention_heads(parallel_config)
-        self.hidden_size = model_config.get_hidden_size()
+        self.inputs_embeds_size = model_config.get_inputs_embeds_size()
         self.attention_chunk_size = model_config.attention_chunk_size
         # Only relevant for models using ALiBi (e.g, MPT)
         self.use_alibi = model_config.uses_alibi
@@ -485,7 +485,7 @@ def __init__(
         # version of this tensor, avoid a RuntimeError by not creating a
         # numpy buffer.
         self.inputs_embeds = self._make_buffer(
-            self.max_num_tokens, self.hidden_size, dtype=self.dtype, numpy=False
+            self.max_num_tokens, self.inputs_embeds_size, dtype=self.dtype, numpy=False
         )
         self.is_token_ids = self._make_buffer(self.max_num_tokens, dtype=torch.bool)
         self.discard_request_mask = self._make_buffer(
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 9c1fbfd24149..f3dd9aa96d2a 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -215,7 +215,7 @@ def __init__(
         self.num_query_heads = model_config.get_num_attention_heads(parallel_config)
         self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)
         self.head_size = model_config.get_head_size()
-        self.hidden_size = model_config.get_hidden_size()
+        self.inputs_embeds_size = model_config.get_inputs_embeds_size()
         self.vocab_size = model_config.get_vocab_size()
 
         # Multi-modal data support
@@ -1406,7 +1406,9 @@ def _dummy_run(self, num_tokens: int, num_reqs: int, num_blocks: int) -> None:
         if self.supports_mm_inputs:
             input_ids = None
             inputs_embeds = torch.zeros(
-                (num_tokens, self.hidden_size), dtype=self.dtype, device=self.device
+                (num_tokens, self.inputs_embeds_size),
+                dtype=self.dtype,
+                device=self.device,
             )
         else:
             input_ids = torch.zeros((num_tokens), dtype=torch.int32).to(self.device)