tests/models/multimodal/pooling/test_siglip.py (6 additions & 1 deletion)

@@ -19,7 +19,12 @@
     }
 )

-MODELS = ["google/siglip-base-patch16-224", "google/siglip2-base-patch16-224"]
+MODELS = [
+    "google/siglip-base-patch16-224",
+    "google/siglip2-base-patch16-224",
+    # Different image embedding dim than text_config.hidden_size
+    "google/siglip2-giant-opt-patch16-384",
+]


 def _run_test(
vllm/config/model.py (10 additions & 0 deletions)

@@ -1202,6 +1202,16 @@ def get_vocab_size(self) -> int:
     def get_hidden_size(self) -> int:
         return getattr(self.hf_text_config, "hidden_size", 0)

+    def get_inputs_embeds_size(self) -> int:
+        # The size of inputs_embeds is usually identical to the size of
+        # the hidden states; however, there are exceptions, such as
+        # embedding models like CLIP and SigLIP.
+        for target_attr in ("projection_dim", "projection_size"):
+            if hasattr(self.hf_text_config, target_attr):
+                return getattr(self.hf_text_config, target_attr)
+
+        return self.get_hidden_size()
+
     @property
     def is_deepseek_mla(self) -> bool:
         if not hasattr(self.hf_text_config, "model_type"):
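A minimal, self-contained sketch of the resolution order this helper implements; the stand-in configs below are hypothetical SimpleNamespace objects, not real HF configs, and the dimensions are illustrative only. The helper prefers the text config's projection_dim (CLIP) or projection_size (SigLIP) when present, and falls back to hidden_size otherwise.

from types import SimpleNamespace

def inputs_embeds_size(text_config) -> int:
    # Mirrors ModelConfig.get_inputs_embeds_size() above.
    for attr in ("projection_dim", "projection_size"):
        if hasattr(text_config, attr):
            return getattr(text_config, attr)
    return getattr(text_config, "hidden_size", 0)

# Hypothetical configs with made-up dimensions.
clip_like = SimpleNamespace(hidden_size=512, projection_dim=768)
siglip_like = SimpleNamespace(hidden_size=1152, projection_size=1536)
plain_llm = SimpleNamespace(hidden_size=4096)

assert inputs_embeds_size(clip_like) == 768
assert inputs_embeds_size(siglip_like) == 1536
assert inputs_embeds_size(plain_llm) == 4096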
vllm/model_executor/models/clip.py (46 additions & 5 deletions)

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import cached_property
 from typing import Annotated, Literal

@@ -903,6 +903,41 @@ def _process_image_inputs(self, inputs: CLIPImagePixelInputs) -> torch.Tensor:
     def get_language_model(self) -> torch.nn.Module:
         return self.text_model

+    def _embed_text_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        embed_input_ids: Callable[[torch.Tensor], torch.Tensor],
+        *,
+        is_multimodal: torch.Tensor | None,
+        handle_oov_mm_token: bool,
+    ) -> torch.Tensor:
+        inputs_embeds = super()._embed_text_input_ids(
+            input_ids,
+            embed_input_ids,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+
+        # NOTE: inputs_embeds in model runner has size text_config.projection_dim
+        # (instead of text_config.hidden_size) to accommodate image embeddings
+        inputs_embeds_size = self.projection_dim
+        if inputs_embeds.shape[1] < inputs_embeds_size:
+            inputs_embeds = torch.cat(
+                [
+                    inputs_embeds,
+                    inputs_embeds.new_empty(
+                        inputs_embeds.shape[0],
+                        inputs_embeds_size - inputs_embeds.shape[1],
+                    ),
+                ],
+                dim=1,
+            )
+        elif inputs_embeds.shape[1] > inputs_embeds_size:
+            # No need to handle this case for now
+            raise NotImplementedError
+
+        return inputs_embeds
+
     def embed_input_ids(
         self,
         input_ids: torch.Tensor,
@@ -949,10 +984,16 @@ def forward(
         if not self._is_text_input:
             return inputs_embeds

-        # Text inputs
-        return self.get_text_features(
-            input_ids=input_ids, position_ids=positions, inputs_embeds=inputs_embeds
-        )
+        # NOTE: inputs_embeds in model runner has size text_config.projection_dim
+        # (instead of text_config.hidden_size) to accommodate image embeddings
+        hidden_size = self.text_embed_dim
+        if inputs_embeds.shape[1] > hidden_size:
+            inputs_embeds = inputs_embeds[:, :hidden_size]
+        elif inputs_embeds.shape[1] < hidden_size:
+            # No need to handle this case for now
+            raise NotImplementedError
+
+        return self.get_text_features(input_ids, positions, inputs_embeds)

     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(
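The same pad-then-slice round trip appears again in siglip.py below. A minimal sketch of the invariant, with illustrative dimensions not taken from any real checkpoint: text embeddings are right-padded with uninitialized columns up to the projection width so they fit the runner's shared buffer, and the padding is dropped again before the text tower consumes them.

import torch

hidden_size = 512       # stand-in for text_config.hidden_size
projection_dim = 768    # stand-in for the projection / runner buffer width

text_embeds = torch.randn(4, hidden_size)

# On the way into the runner buffer: right-pad with uninitialized columns,
# as in _embed_text_input_ids above.
pad = text_embeds.new_empty(text_embeds.shape[0], projection_dim - hidden_size)
padded = torch.cat([text_embeds, pad], dim=1)
assert padded.shape == (4, projection_dim)

# On the way into the text tower: slice the padding back off, as in forward().
restored = padded[:, :hidden_size]
assert torch.equal(restored, text_embeds)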
vllm/model_executor/models/siglip.py (46 additions & 3 deletions)

@@ -1,10 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Implementation of SiglipVisionModel intended to be only used
-within a vision language model."""

 import math
-from collections.abc import Iterable, Mapping
+from collections.abc import Callable, Iterable, Mapping
 from functools import cached_property
 from typing import Annotated, Literal

@@ -976,6 +974,7 @@ def forward(

         position_embeddings = self.position_embedding(position_ids)
         embeddings = inputs_embeds + position_embeddings
+
         return embeddings


@@ -1145,6 +1144,41 @@ def _process_image_inputs(self, inputs: SiglipImagePixelInputs) -> torch.Tensor:
     def get_language_model(self) -> torch.nn.Module:
         return self.text_model

+    def _embed_text_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        embed_input_ids: Callable[[torch.Tensor], torch.Tensor],
+        *,
+        is_multimodal: torch.Tensor | None,
+        handle_oov_mm_token: bool,
+    ) -> torch.Tensor:
+        inputs_embeds = super()._embed_text_input_ids(
+            input_ids,
+            embed_input_ids,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+
+        # NOTE: inputs_embeds in model runner has size text_config.projection_size
+        # (instead of text_config.hidden_size) to accommodate image embeddings
+        inputs_embeds_size = self.text_projection_size
+        if inputs_embeds.shape[1] < inputs_embeds_size:
+            inputs_embeds = torch.cat(
+                [
+                    inputs_embeds,
+                    inputs_embeds.new_empty(
+                        inputs_embeds.shape[0],
+                        inputs_embeds_size - inputs_embeds.shape[1],
+                    ),
+                ],
+                dim=1,
+            )
+        elif inputs_embeds.shape[1] > inputs_embeds_size:
+            # No need to handle this case for now
+            raise NotImplementedError
+
+        return inputs_embeds
+
     def embed_input_ids(
         self,
         input_ids: torch.Tensor,
@@ -1190,6 +1224,15 @@ def forward(
         if not self._is_text_input:
             return inputs_embeds

+        # NOTE: inputs_embeds in model runner has size text_config.projection_size
+        # (instead of text_config.hidden_size) to accommodate image embeddings
+        hidden_size = self.text_embed_dim
+        if inputs_embeds.shape[1] > hidden_size:
+            inputs_embeds = inputs_embeds[:, :hidden_size]
+        elif inputs_embeds.shape[1] < hidden_size:
+            # No need to handle this case for now
+            raise NotImplementedError
+
         return self.get_text_features(input_ids, positions, inputs_embeds)

     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
vllm/v1/spec_decode/eagle.py (4 additions & 1 deletion)

@@ -80,6 +80,7 @@ def __init__(
         # the draft model's hidden size can be different from the target model's
         # hidden size (e.g., Llama 3.3 70B).
         self.hidden_size = self.draft_model_config.get_hidden_size()
+        self.inputs_embeds_size = self.draft_model_config.get_inputs_embeds_size()

         # Multi-modal data support
         self.mm_registry = MULTIMODAL_REGISTRY
@@ -151,7 +152,9 @@ def __init__(
         )

         self.inputs_embeds = torch.zeros(
-            (self.max_num_tokens, self.hidden_size), dtype=self.dtype, device=device
+            (self.max_num_tokens, self.inputs_embeds_size),
+            dtype=self.dtype,
+            device=device,
         )

         self.backup_next_token_ids = CpuGpuBuffer(
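This change and the runner changes below all make the same substitution: persistent embedding buffers are allocated at inputs_embeds_size rather than hidden_size. A rough sketch of the sizing concern, with made-up dimensions: when the projection width exceeds text_config.hidden_size, a buffer allocated at hidden_size could not hold the image embeddings copied into it.

import torch

max_num_tokens = 8
hidden_size = 1152          # assumed text_config.hidden_size
inputs_embeds_size = 1536   # assumed projection width (image embedding dim)

# Allocated as in eagle.py above: wide enough for image embeddings.
inputs_embeds = torch.zeros(max_num_tokens, inputs_embeds_size)

image_embeds = torch.randn(3, inputs_embeds_size)
inputs_embeds[:3] = image_embeds  # fits; an (8, hidden_size) buffer would not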
vllm/v1/worker/gpu/input_batch.py (1 addition & 1 deletion)

@@ -17,7 +17,7 @@ def __init__(
         self,
         max_num_reqs: int,
         max_num_tokens: int,
-        hidden_size: int,
+        inputs_embeds_size: int,
         vocab_size: int,
         dtype: torch.dtype,
         device: torch.device,
vllm/v1/worker/gpu/model_runner.py (2 additions & 2 deletions)

@@ -98,7 +98,7 @@ def __init__(
         self.max_model_len = self.model_config.max_model_len
         self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
         self.max_num_reqs = self.scheduler_config.max_num_seqs
-        self.hidden_size = self.model_config.get_hidden_size()
+        self.inputs_embeds_size = self.model_config.get_inputs_embeds_size()

         self.dp_size = self.parallel_config.data_parallel_size
         self.dp_rank = self.parallel_config.data_parallel_rank
@@ -134,7 +134,7 @@ def __init__(
         self.input_buffers = InputBuffers(
             max_num_reqs=self.max_num_reqs,
             max_num_tokens=self.max_num_tokens,
-            hidden_size=self.hidden_size,
+            inputs_embeds_size=self.inputs_embeds_size,
             vocab_size=self.vocab_size,
             dtype=self.dtype,
             device=self.device,
vllm/v1/worker/gpu/spec_decode/eagle.py (2 additions & 1 deletion)

@@ -44,14 +44,15 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         # the draft model's hidden size can be different from the target model's
         # hidden size (e.g., Llama 3.3 70B).
         self.hidden_size = self.draft_model_config.get_hidden_size()
+        self.inputs_embeds_size = self.draft_model_config.get_inputs_embeds_size()
         self.vocab_size = self.draft_model_config.get_vocab_size()
         self.pin_memory = is_pin_memory_available()
         self.dtype = vllm_config.model_config.dtype

         self.input_buffers = InputBuffers(
             max_num_reqs=self.max_num_reqs,
             max_num_tokens=self.max_num_tokens,
-            hidden_size=self.hidden_size,
+            inputs_embeds_size=self.inputs_embeds_size,
             vocab_size=self.vocab_size,
             dtype=self.dtype,
             device=device,
vllm/v1/worker/gpu_model_runner.py (2 additions & 2 deletions)

@@ -320,7 +320,7 @@ def __init__(

         # Model-related.
         self.num_query_heads = model_config.get_num_attention_heads(parallel_config)
-        self.hidden_size = model_config.get_hidden_size()
+        self.inputs_embeds_size = model_config.get_inputs_embeds_size()
         self.attention_chunk_size = model_config.attention_chunk_size
         # Only relevant for models using ALiBi (e.g., MPT)
         self.use_alibi = model_config.uses_alibi
@@ -485,7 +485,7 @@ def __init__(
         # version of this tensor, avoid a RuntimeError by not creating a
         # numpy buffer.
         self.inputs_embeds = self._make_buffer(
-            self.max_num_tokens, self.hidden_size, dtype=self.dtype, numpy=False
+            self.max_num_tokens, self.inputs_embeds_size, dtype=self.dtype, numpy=False
         )
         self.is_token_ids = self._make_buffer(self.max_num_tokens, dtype=torch.bool)
         self.discard_request_mask = self._make_buffer(
vllm/v1/worker/tpu_model_runner.py (4 additions & 2 deletions)

@@ -215,7 +215,7 @@ def __init__(
         self.num_query_heads = model_config.get_num_attention_heads(parallel_config)
         self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)
         self.head_size = model_config.get_head_size()
-        self.hidden_size = model_config.get_hidden_size()
+        self.inputs_embeds_size = model_config.get_inputs_embeds_size()
         self.vocab_size = model_config.get_vocab_size()

         # Multi-modal data support
@@ -1406,7 +1406,9 @@ def _dummy_run(self, num_tokens: int, num_reqs: int, num_blocks: int) -> None:
         if self.supports_mm_inputs:
             input_ids = None
             inputs_embeds = torch.zeros(
-                (num_tokens, self.hidden_size), dtype=self.dtype, device=self.device
+                (num_tokens, self.inputs_embeds_size),
+                dtype=self.dtype,
+                device=self.device,
             )
         else:
             input_ids = torch.zeros((num_tokens), dtype=torch.int32).to(self.device)