Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion tests/models/multimodal/pooling/test_siglip.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@
}
)

MODELS = ["google/siglip-base-patch16-224", "google/siglip2-base-patch16-224"]
MODELS = [
"google/siglip-base-patch16-224",
"google/siglip2-base-patch16-224",
# Different image embedding dim than text_config.hidden_size
"google/siglip2-giant-opt-patch16-384",
]


def _run_test(
Expand Down
10 changes: 10 additions & 0 deletions vllm/config/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -1202,6 +1202,16 @@ def get_vocab_size(self) -> int:
def get_hidden_size(self) -> int:
    """Return the hidden size declared by the HF text config.

    Returns:
        The ``hidden_size`` attribute of ``self.hf_text_config``, or ``0``
        when the text config does not define that attribute.
    """
    text_config = self.hf_text_config
    return getattr(text_config, "hidden_size", 0)

def get_inputs_embeds_size(self) -> int:
    """Return the size to use for the last dimension of ``inputs_embeds``.

    In most cases the size of ``inputs_embeds`` is identical to the size of
    the hidden states. There are exceptions, for example embedding models
    like CLIP and SigLIP, so the text config is first probed for an
    explicit override.

    Returns:
        The first non-``None`` value among ``inputs_embeds_size``,
        ``projection_dim`` and ``projection_size`` on
        ``self.hf_text_config`` (checked in that order), otherwise the
        result of ``self.get_hidden_size()``.
    """
    candidate_attrs = ("inputs_embeds_size", "projection_dim", "projection_size")
    for target_attr in candidate_attrs:
        size = getattr(self.hf_text_config, target_attr, None)
        # An attribute that exists but is unset (None) carries no size
        # information; returning it would break the ``int`` contract, so
        # keep looking instead.
        if size is not None:
            return size

    return self.get_hidden_size()

@property
def is_deepseek_mla(self) -> bool:
if not hasattr(self.hf_text_config, "model_type"):
Expand Down
5 changes: 4 additions & 1 deletion vllm/v1/spec_decode/eagle.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def __init__(
# the draft model's hidden size can be different from the target model's
# hidden size (e.g., Llama 3.3 70B).
self.hidden_size = self.draft_model_config.get_hidden_size()
self.inputs_embeds_size = self.draft_model_config.get_inputs_embeds_size()

# Multi-modal data support
self.mm_registry = MULTIMODAL_REGISTRY
Expand Down Expand Up @@ -151,7 +152,9 @@ def __init__(
)

self.inputs_embeds = torch.zeros(
(self.max_num_tokens, self.hidden_size), dtype=self.dtype, device=device
(self.max_num_tokens, self.inputs_embeds_size),
dtype=self.dtype,
device=device,
)

self.backup_next_token_ids = CpuGpuBuffer(
Expand Down
2 changes: 1 addition & 1 deletion vllm/v1/worker/gpu/input_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def __init__(
self,
max_num_reqs: int,
max_num_tokens: int,
hidden_size: int,
inputs_embeds_size: int,
vocab_size: int,
dtype: torch.dtype,
device: torch.device,
Expand Down
4 changes: 2 additions & 2 deletions vllm/v1/worker/gpu/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def __init__(
self.max_model_len = self.model_config.max_model_len
self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
self.max_num_reqs = self.scheduler_config.max_num_seqs
self.hidden_size = self.model_config.get_hidden_size()
self.inputs_embeds_size = self.model_config.get_inputs_embeds_size()

self.dp_size = self.parallel_config.data_parallel_size
self.dp_rank = self.parallel_config.data_parallel_rank
Expand Down Expand Up @@ -134,7 +134,7 @@ def __init__(
self.input_buffers = InputBuffers(
max_num_reqs=self.max_num_reqs,
max_num_tokens=self.max_num_tokens,
hidden_size=self.hidden_size,
inputs_embeds_size=self.inputs_embeds_size,
vocab_size=self.vocab_size,
dtype=self.dtype,
device=self.device,
Expand Down
3 changes: 2 additions & 1 deletion vllm/v1/worker/gpu/spec_decode/eagle.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,15 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
# the draft model's hidden size can be different from the target model's
# hidden size (e.g., Llama 3.3 70B).
self.hidden_size = self.draft_model_config.get_hidden_size()
self.inputs_embeds_size = self.draft_model_config.get_inputs_embeds_size()
self.vocab_size = self.draft_model_config.get_vocab_size()
self.pin_memory = is_pin_memory_available()
self.dtype = vllm_config.model_config.dtype

self.input_buffers = InputBuffers(
max_num_reqs=self.max_num_reqs,
max_num_tokens=self.max_num_tokens,
hidden_size=self.hidden_size,
inputs_embeds_size=self.inputs_embeds_size,
vocab_size=self.vocab_size,
dtype=self.dtype,
device=device,
Expand Down
4 changes: 2 additions & 2 deletions vllm/v1/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ def __init__(

# Model-related.
self.num_query_heads = model_config.get_num_attention_heads(parallel_config)
self.hidden_size = model_config.get_hidden_size()
self.inputs_embeds_size = model_config.get_inputs_embeds_size()
self.attention_chunk_size = model_config.attention_chunk_size
# Only relevant for models using ALiBi (e.g., MPT)
self.use_alibi = model_config.uses_alibi
Expand Down Expand Up @@ -485,7 +485,7 @@ def __init__(
# version of this tensor, avoid a RuntimeError by not creating a
# numpy buffer.
self.inputs_embeds = self._make_buffer(
self.max_num_tokens, self.hidden_size, dtype=self.dtype, numpy=False
self.max_num_tokens, self.inputs_embeds_size, dtype=self.dtype, numpy=False
)
self.is_token_ids = self._make_buffer(self.max_num_tokens, dtype=torch.bool)
self.discard_request_mask = self._make_buffer(
Expand Down
6 changes: 4 additions & 2 deletions vllm/v1/worker/tpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def __init__(
self.num_query_heads = model_config.get_num_attention_heads(parallel_config)
self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)
self.head_size = model_config.get_head_size()
self.hidden_size = model_config.get_hidden_size()
self.inputs_embeds_size = model_config.get_inputs_embeds_size()
self.vocab_size = model_config.get_vocab_size()

# Multi-modal data support
Expand Down Expand Up @@ -1406,7 +1406,9 @@ def _dummy_run(self, num_tokens: int, num_reqs: int, num_blocks: int) -> None:
if self.supports_mm_inputs:
input_ids = None
inputs_embeds = torch.zeros(
(num_tokens, self.hidden_size), dtype=self.dtype, device=self.device
(num_tokens, self.inputs_embeds_size),
dtype=self.dtype,
device=self.device,
)
else:
input_ids = torch.zeros((num_tokens), dtype=torch.int32).to(self.device)
Expand Down