Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/supported_models/multimodal_language_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,5 @@ in the GitHub search bar.
| **Gemma 3 (Multimodal)** | `google/gemma-3-4b-it` | `gemma-it` | Gemma 3's larger models (4B, 12B, 27B) accept images (each image encoded as 256 tokens) alongside text in a combined 128K-token context. |
| **Kimi-VL** (A3B) | `moonshotai/Kimi-VL-A3B-Instruct` | `kimi-vl` | Kimi-VL is a multimodal model that can understand and generate text from images. |
| **Mistral-Small-3.1-24B** | `mistralai/Mistral-Small-3.1-24B-Instruct-2503` | `mistral` | Mistral 3.1 is a multimodal model that can generate text from text or images input. It also supports tool calling and structured output. |
| **Phi-4-multimodal-instruct** | `microsoft/Phi-4-multimodal-instruct` | `phi-4-mm` | Phi-4-multimodal-instruct is the multimodal variant of the Phi-4-mini model, enhanced with LoRA for improved multimodal capabilities. Currently, it supports only text and vision modalities in SGLang. |
| **Phi-4-multimodal-instruct** | `microsoft/Phi-4-multimodal-instruct` | `phi-4-mm` | Phi-4-multimodal-instruct is the multimodal variant of the Phi-4-mini model, enhanced with LoRA for improved multimodal capabilities. It supports text, vision and audio modalities in SGLang. |
| **MiMo-VL** (7B) | `XiaomiMiMo/MiMo-VL-7B-RL` | `mimo-vl` | Xiaomi's compact yet powerful vision-language model featuring a native resolution ViT encoder for fine-grained visual details, an MLP projector for cross-modal alignment, and the MiMo-7B language model optimized for complex reasoning tasks. |
1 change: 1 addition & 0 deletions python/sglang/srt/conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,7 @@ def generate_chat_conv(
sep="<|end|>",
stop_str="<|end|>",
image_token="<|endoftext10|>",
audio_token="<|endoftext11|>",
)
)

Expand Down
4 changes: 4 additions & 0 deletions python/sglang/srt/managers/schedule_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,10 @@ class MultimodalDataItem:
# For gemma3n
input_features_mask: Optional[torch.Tensor] = None

# For phi4-mm
image_attention_mask: Optional[torch.Tensor] = None
audio_attention_mask: Optional[torch.Tensor] = None

@staticmethod
def is_empty_list(l):
if l is None:
Expand Down
41 changes: 39 additions & 2 deletions python/sglang/srt/models/phi4mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.idefics2 import Idefics2VisionTransformer
from sglang.srt.models.llama import LlamaForCausalLM
from sglang.srt.models.phi4mm_audio import AudioEmbedding

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -420,16 +421,49 @@ def __init__(
model_dir=config._name_or_path,
)

if isinstance(config.embd_layer["audio_embd_layer"], dict):
embedding_config = {
"embedding_cls": config.embd_layer["audio_embd_layer"]["embedding_cls"],
**config.embd_layer["audio_embd_layer"],
}
else:
embedding_config = {"embedding_cls": config.embd_layer["embedding_cls"]}

self.embed_tokens_extend = AudioEmbedding(config, **embedding_config)

def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
    """Encode batched image inputs into embedding vectors.

    Concatenates per-item pixel values, image attention masks, and image
    sizes along dim 0, runs them through the vision encoder, and returns
    the concatenated image embeddings cast to the encoder's parameter dtype.

    Args:
        items: multimodal data items, each carrying `feature` (pixel
            values), `image_attention_mask`, and `image_sizes` tensors.
            Assumes dim 0 of each tensor is the per-item batch dim —
            TODO confirm against the processor that fills these fields.

    Returns:
        A single tensor of image embeddings in the vision encoder's dtype.
    """
    dtype = next(self.vision_encoder.parameters()).dtype
    pixel_values = torch.cat([item.feature for item in items], dim=0).type(dtype)
    # Fixed: a stale duplicate assignment reading the old `item.image_emb_mask`
    # field preceded this one and was immediately overwritten (dead store);
    # the mask now lives on `item.image_attention_mask`.
    image_attention_mask = torch.cat(
        [item.image_attention_mask for item in items], dim=0
    )
    image_sizes = torch.cat([item.image_sizes for item in items], dim=0)
    image_embeds = self.vision_encoder(
        pixel_values, image_sizes, image_attention_mask
    )
    return torch.cat(image_embeds).type(dtype)

def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
    """Embed per-item audio features via the audio embedding module.

    Each item's `feature` tensor is (num_audios_in_a_sequence, T, D); its
    `audio_attention_mask` is a BoolTensor of the same layout, or None.
    Inputs are moved to the embedding module's device and cast to its
    dtype before the forward pass; per-item embeddings are concatenated.

    Returns:
        One tensor of audio embeddings covering all items, in the audio
        embedding module's parameter dtype.
    """
    ref_param = next(self.embed_tokens_extend.parameters())
    target_device = ref_param.device
    target_dtype = ref_param.dtype

    per_item_embeds = []
    for item in items:
        mask = item.audio_attention_mask
        if mask is not None:
            mask = mask.to(target_device)
        per_item_embeds.append(
            self.embed_tokens_extend(
                audio_features=item.feature.to(target_device).type(target_dtype),
                audio_attention_mask=mask,
            )
        )
    return torch.cat(per_item_embeds).type(target_dtype)

def forward(
self,
input_ids: torch.Tensor,
Expand All @@ -443,6 +477,7 @@ def forward(
language_model=self.language_model,
data_embedding_funcs={
Modality.IMAGE: self.get_image_feature,
Modality.AUDIO: self.get_audio_feature,
},
positions=positions,
)
Expand All @@ -464,6 +499,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
(".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
]
prefix_mapping = {
"model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.",
"model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.",
"model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.",
"model.embed_tokens_extend.image_embed.": "vision_encoder.",
"model.": "language_model.model.",
}
Expand All @@ -472,7 +510,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
"img_processor.encoder.layers.26",
"img_processor.head",
"img_processor.post_layernorm",
"audio",
]

def _should_skip(name: str) -> bool:
Expand Down
Loading
Loading