Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/supported_models/multimodal_language_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,5 @@ in the GitHub search bar.
| **Gemma 3 (Multimodal)** | `google/gemma-3-4b-it` | `gemma-it` | Gemma 3's larger models (4B, 12B, 27B) accept images (each image encoded as 256 tokens) alongside text in a combined 128K-token context. |
| **Kimi-VL** (A3B) | `moonshotai/Kimi-VL-A3B-Instruct` | `kimi-vl` | Kimi-VL is a multimodal model that can understand and generate text from images. |
| **Mistral-Small-3.1-24B** | `mistralai/Mistral-Small-3.1-24B-Instruct-2503` | `mistral` | Mistral 3.1 is a multimodal model that can generate text from text or images input. It also supports tool calling and structured output. |
| **Phi-4-multimodal-instruct** | `microsoft/Phi-4-multimodal-instruct` | `phi-4-mm` | Phi-4-multimodal-instruct is the multimodal variant of the Phi-4-mini model, enhanced with LoRA for improved multimodal capabilities. Currently, it supports only text and vision modalities in SGLang. |
| **Phi-4-multimodal-instruct** | `microsoft/Phi-4-multimodal-instruct` | `phi-4-mm` | Phi-4-multimodal-instruct is the multimodal variant of the Phi-4-mini model, enhanced with LoRA for improved multimodal capabilities. It supports text, vision and audio modalities in SGLang. |
| **MiMo-VL** (7B) | `XiaomiMiMo/MiMo-VL-7B-RL` | `mimo-vl` | Xiaomi's compact yet powerful vision-language model featuring a native resolution ViT encoder for fine-grained visual details, an MLP projector for cross-modal alignment, and the MiMo-7B language model optimized for complex reasoning tasks. |
1 change: 1 addition & 0 deletions python/sglang/srt/conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,7 @@ def generate_chat_conv(
sep="<|end|>",
stop_str="<|end|>",
image_token="<|endoftext10|>",
audio_token="<|endoftext11|>",
)
)

Expand Down
4 changes: 4 additions & 0 deletions python/sglang/srt/managers/schedule_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,10 @@ class MultimodalDataItem:
# For gemma3n
input_features_mask: Optional[torch.Tensor] = None

# For phi4-mm
image_attention_mask: Optional[torch.Tensor] = None
audio_attention_mask: Optional[torch.Tensor] = None

@staticmethod
def is_empty_list(l):
if l is None:
Expand Down
41 changes: 39 additions & 2 deletions python/sglang/srt/models/phi4mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.idefics2 import Idefics2VisionTransformer
from sglang.srt.models.llama import LlamaForCausalLM
from sglang.srt.models.phi4mm_audio import AudioEmbedding

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -420,16 +421,49 @@ def __init__(
model_dir=config._name_or_path,
)

if isinstance(config.embd_layer["audio_embd_layer"], dict):
embedding_config = {
"embedding_cls": config.embd_layer["audio_embd_layer"]["embedding_cls"],
**config.embd_layer["audio_embd_layer"],
}
else:
embedding_config = {"embedding_cls": config.embd_layer["embedding_cls"]}

self.embed_tokens_extend = AudioEmbedding(config, **embedding_config)

def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
    """Encode batched image inputs into embedding vectors.

    Concatenates per-item pixel values, image attention masks, and image
    sizes along dim 0, runs them through the vision encoder, and returns
    the concatenated image embeddings cast to the encoder's parameter dtype.

    Args:
        items: multimodal data items, each carrying `feature` (pixel
            values), `image_attention_mask`, and `image_sizes` tensors.
            Assumes dim 0 of each tensor is the per-item batch dim —
            TODO confirm against the processor that fills these fields.

    Returns:
        A single tensor of image embeddings in the vision encoder's dtype.
    """
    dtype = next(self.vision_encoder.parameters()).dtype
    pixel_values = torch.cat([item.feature for item in items], dim=0).type(dtype)
    # Fixed: a stale duplicate assignment reading the old `item.image_emb_mask`
    # field preceded this one and was immediately overwritten (dead store);
    # the mask now lives on `item.image_attention_mask`.
    image_attention_mask = torch.cat(
        [item.image_attention_mask for item in items], dim=0
    )
    image_sizes = torch.cat([item.image_sizes for item in items], dim=0)
    image_embeds = self.vision_encoder(
        pixel_values, image_sizes, image_attention_mask
    )
    return torch.cat(image_embeds).type(dtype)

def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
    """Embed per-item audio features via the audio embedding module.

    Each item's `feature` tensor is (num_audios_in_a_sequence, T, D); its
    `audio_attention_mask` is a BoolTensor of the same layout, or None.
    Inputs are moved to the embedding module's device and cast to its
    dtype before the forward pass; per-item embeddings are concatenated.

    Returns:
        One tensor of audio embeddings covering all items, in the audio
        embedding module's parameter dtype.
    """
    ref_param = next(self.embed_tokens_extend.parameters())
    target_device = ref_param.device
    target_dtype = ref_param.dtype

    per_item_embeds = []
    for item in items:
        mask = item.audio_attention_mask
        if mask is not None:
            mask = mask.to(target_device)
        per_item_embeds.append(
            self.embed_tokens_extend(
                audio_features=item.feature.to(target_device).type(target_dtype),
                audio_attention_mask=mask,
            )
        )
    return torch.cat(per_item_embeds).type(target_dtype)

def forward(
self,
input_ids: torch.Tensor,
Expand All @@ -443,6 +477,7 @@ def forward(
language_model=self.language_model,
data_embedding_funcs={
Modality.IMAGE: self.get_image_feature,
Modality.AUDIO: self.get_audio_feature,
},
positions=positions,
)
Expand All @@ -464,6 +499,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
(".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
]
prefix_mapping = {
"model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.",
"model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.",
"model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.",
"model.embed_tokens_extend.image_embed.": "vision_encoder.",
"model.": "language_model.model.",
}
Expand All @@ -472,7 +510,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
"img_processor.encoder.layers.26",
"img_processor.head",
"img_processor.post_layernorm",
"audio",
]

def _should_skip(name: str) -> bool:
Expand Down
Loading
Loading