Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion vllm/model_executor/models/aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.config.image_token_index)
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/aya_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/blip2.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,7 +641,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
_IMAGE_TOKEN_ID)
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/chameleon.py
Original file line number Diff line number Diff line change
Expand Up @@ -1005,7 +1005,8 @@ def get_input_embeddings(
) -> torch.Tensor:

inputs_embeds = self.model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.model.vocabulary_mapping.image_token_id)
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/deepseek_vl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.image_token_id)
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/florence2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1046,7 +1046,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.pad_token_id)
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/fuyu.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/gemma3_mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/glm4v.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,8 @@ def get_input_embeddings(
) -> torch.Tensor:
inputs_embeds = self.transformer.get_input_embeddings(input_ids)

if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/granite_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
"""Compute the merged LLM / audio embeddings."""
if multimodal_embeddings is None:
if multimodal_embeddings is None \
or len(multimodal_embeddings) == 0:
return self.language_model.get_input_embeddings(input_ids)

inputs_embeds = embed_multimodal(
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/idefics3.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/internvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1336,7 +1336,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
context_token_ids = [
token_id for token_id in (self.img_context_token_id,
self.video_context_token_id)
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/kimi_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,8 @@ def get_input_embeddings(
# model as one of the requirements of basic vLLM model implementation.
inputs_embeds = self.language_model.get_input_embeddings(input_ids)

if multimodal_embeddings is not None:
if multimodal_embeddings is not None and len(
multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,7 +683,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/llava_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:

if not multimodal_embeddings:
if multimodal_embeddings is None \
or len(multimodal_embeddings) == 0:
return self.language_model.get_input_embeddings(input_ids)

inputs_embeds = embed_multimodal(
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/llava_next_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.config.video_token_index)
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/llava_onevision.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,7 +881,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
[self.config.image_token_index, self.config.video_token_index])
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/minicpmv.py
Original file line number Diff line number Diff line change
Expand Up @@ -892,7 +892,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.llm.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
assert len(self.mm_token_ids) > 0
inputs_embeds = merge_multimodal_embeddings(
input_ids,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/minimax_vl_01.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/mistral3.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/mllama4.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,7 +808,8 @@ def get_input_embeddings(
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)

if multimodal_embeddings is not None:
if multimodal_embeddings is not None and len(
multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/molmo.py
Original file line number Diff line number Diff line change
Expand Up @@ -1487,7 +1487,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
assert self.img_patch_id is not None

inputs_embeds = merge_multimodal_embeddings(
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/ovis.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.llm.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.image_pad_token_id)
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/paligemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.config.image_token_index)
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/phi3v.py
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.embed_tokens(input_ids)
if multimodal_embeddings:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.image_token_id)
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/phi4mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1148,7 +1148,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.model.embed_tokens(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None and len(
multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
[_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID])
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/pixtral.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
Expand Down
5 changes: 3 additions & 2 deletions vllm/model_executor/models/qwen2_5_omni_thinker.py
Original file line number Diff line number Diff line change
Expand Up @@ -805,7 +805,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:

# TODO (ywang96): support overlapping modality embeddings so that
# `use_audio_in_video` will work on V1.
Expand Down Expand Up @@ -845,7 +846,7 @@ def get_input_embeddings_v0(
multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is None:
if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
return inputs_embeds

for embeddings, modality in multimodal_embeddings:
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/qwen2_5_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1046,7 +1046,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
[self.config.image_token_id, self.config.video_token_id])
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/qwen2_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.config.audio_token_index)
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/qwen2_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1289,7 +1289,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
[self.config.image_token_id, self.config.video_token_id])
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/qwen_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,7 +754,8 @@ def get_input_embeddings(
) -> torch.Tensor:
inputs_embeds = self.transformer.get_input_embeddings(input_ids)

if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.transformer.visual.image_pad_id)
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/skyworkr1v.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,7 +883,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
assert self.img_context_token_id is not None
self._set_visual_token_mask(input_ids)
inputs_embeds = merge_multimodal_embeddings(
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/tarsier.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/ultravox.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,8 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings is not None \
and len(multimodal_embeddings) != 0:

# TODO(ywang96): remove this block after v0 is deprecated.
if not envs.VLLM_USE_V1:
Expand Down