Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
2e20be6
[qwen3_next vl] first push draft model file
Jan 21, 2026
fd9bdb7
[qwen3_next vl] load moe, need to fix some bugs
Jan 22, 2026
f1cec48
[mrope] fix for transformers v5:
Jan 22, 2026
6f2dfe1
[qwen3_next vl] support moe version
Jan 22, 2026
89f10c9
[qwen3_next vl] support dense model
Jan 22, 2026
c6a6eaa
[lint] delete debug info
Jan 23, 2026
ab2f31b
[qwen3_next vl] support mtp model
Jan 23, 2026
c046573
fix: add multimodal config for qwen3.5
Jan 24, 2026
afc5a35
refactor(srt): Renamed modules and simplified the Qwen 3.5 model code…
Jan 24, 2026
39f3499
fix: rope_theta and partial_rotary_factor factor
Jan 24, 2026
6c04e0b
[qwen3_5] add qwen3_5 dense and moe model
Jan 24, 2026
f5eeb96
fix(srt): import FusedMoE
Jan 24, 2026
20ed51f
fix: offline preprocessed video
Jan 29, 2026
ed4bb6d
1. fallback qwen3vl
Feb 4, 2026
f004a2c
fix: remove visual parameter for Qwen3_5ForCausalLM
Feb 4, 2026
7a8906f
fix import
Feb 5, 2026
14f5fa3
[qwen3_next vl] first push draft model file
Jan 21, 2026
69c5b3a
[qwen3_next vl] load moe, need to fix some bugs
Jan 22, 2026
0ad9dc1
[mrope] fix for transformers v5:
Jan 22, 2026
e263cac
[qwen3_next vl] support moe version
Jan 22, 2026
e71b10c
[qwen3_next vl] support dense model
Jan 22, 2026
11ef96c
[lint] delete debug info
Jan 23, 2026
36ddc67
[qwen3_next vl] support mtp model
Jan 23, 2026
f3e43e8
fix: add multimodal config for qwen3.5
Jan 24, 2026
87e60df
refactor(srt): Renamed modules and simplified the Qwen 3.5 model code…
Jan 24, 2026
542670c
fix: rope_theta and partial_rotary_factor factor
Jan 24, 2026
2bbfe83
[qwen3_5] add qwen3_5 dense and moe model
Jan 24, 2026
12922e6
fix(srt): import FusedMoE
Jan 24, 2026
0cc1cba
fix: offline preprocessed video
Jan 29, 2026
e7b8130
1. fallback qwen3vl
Feb 4, 2026
f867390
fix: remove visual parameter for Qwen3_5ForCausalLM
Feb 4, 2026
b7e00fd
fix import
Feb 5, 2026
a4a6b8c
remove Qwen3NextVLConfig
Feb 5, 2026
bba3160
Merge branch 'yuche/qwen_next_vl' of http://gitlab.alibaba-inc.com/Da…
Feb 5, 2026
68b00b6
fix(logits_processor): clean up redundant code and merge conflict mar…
Feb 5, 2026
4ddc7c9
fix: support RadixLinearAttention
Feb 5, 2026
ac32bb8
fix: add target_extend_input_embeds for mtp(vl embedding)
Feb 5, 2026
179f122
refactor(models): optimize convolution weight passing logic in Qwen3_…
Feb 7, 2026
5ede952
fix: add activation type
Feb 7, 2026
5488ab2
fix: reduce_results for out_proj
Feb 7, 2026
6828225
refactor(logits): Replace target_extend_input_embeds with mm_input_em…
Feb 8, 2026
58dfb2b
fix: rename target_extend_input_embeds to mm_input_embeds
Feb 8, 2026
0df689a
fix: indent for deepstack
Feb 9, 2026
4b46a5a
refactor(srt): Streamline redundant comments in Qwen3_5MultiTokenPred…
Feb 9, 2026
43623bd
feat(moe): fix mamba cache support
Feb 9, 2026
f175dc3
fix: mamba cache support Qwen3_5ForConditionalGeneration
Feb 9, 2026
b551a40
pre-commit
Feb 9, 2026
35a201d
fix qwen3.5 video preprocess
Feb 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmark/kernels/fused_moe_triton/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def get_model_config(
"Qwen3MoeForCausalLM",
"Qwen3NextForCausalLM",
"Qwen3VLMoeForConditionalGeneration",
"Qwen3_5MoeForConditionalGeneration",
]:
E = config.num_experts // ep_size
topk = config.num_experts_per_tok
Expand Down
3 changes: 3 additions & 0 deletions python/sglang/srt/configs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from sglang.srt.configs.nano_nemotron_vl import NemotronH_Nano_VL_V2_Config
from sglang.srt.configs.nemotron_h import NemotronHConfig
from sglang.srt.configs.olmo3 import Olmo3Config
from sglang.srt.configs.qwen3_5 import Qwen3_5Config, Qwen3_5MoeConfig
from sglang.srt.configs.qwen3_next import Qwen3NextConfig
from sglang.srt.configs.step3_vl import (
Step3TextConfig,
Expand All @@ -43,6 +44,8 @@
"KimiLinearConfig",
"KimiK25Config",
"Qwen3NextConfig",
"Qwen3_5Config",
"Qwen3_5MoeConfig",
"DotsVLMConfig",
"DotsOCRConfig",
"FalconH1Config",
Expand Down
9 changes: 9 additions & 0 deletions python/sglang/srt/configs/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,13 @@ def _config_draft_model(self):
self.hf_config.architectures[0] = "Qwen3NextForCausalLMMTP"
self.hf_config.num_nextn_predict_layers = 1

if is_draft_model and self.hf_config.architectures[0] in [
"Qwen3_5ForConditionalGeneration",
"Qwen3_5MoeForConditionalGeneration",
]:
self.hf_config.architectures[0] = "Qwen3_5ForCausalLMMTP"
self.hf_config.num_nextn_predict_layers = 1

if is_draft_model and self.hf_config.architectures[0] == "ExaoneMoEForCausalLM":
self.hf_config.architectures[0] = "ExaoneMoEForCausalLMMTP"
self.hf_config.num_nextn_predict_layers = 1
Expand Down Expand Up @@ -1193,6 +1200,8 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
"Qwen2_5_VLForConditionalGeneration",
"Qwen3VLForConditionalGeneration",
"Qwen3VLMoeForConditionalGeneration",
"Qwen3_5ForConditionalGeneration",
"Qwen3_5MoeForConditionalGeneration",
"Qwen3OmniMoeForConditionalGeneration",
"KimiVLForConditionalGeneration",
"InternVLChatModel",
Expand Down
113 changes: 113 additions & 0 deletions python/sglang/srt/configs/qwen3_5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from transformers import PretrainedConfig

from sglang.srt.configs.qwen3_next import Qwen3NextConfig
from sglang.srt.configs.qwen3_vl import Qwen3VLVisionConfig


class Qwen3_5VisionConfig(Qwen3VLVisionConfig):
    # Vision-tower configuration for Qwen3.5. Reuses all fields from the
    # Qwen3-VL vision config unchanged; only the model type and the key under
    # which this sub-config is stored in the composite config are re-registered.
    model_type = "qwen3_5"
    base_config_key = "vision_config"


class Qwen3_5TextConfig(Qwen3NextConfig):
    """Text-backbone configuration for Qwen3.5.

    A thin wrapper over `Qwen3NextConfig` that re-registers the model type and
    sub-config key, and guarantees `rope_scaling` is always a dict.
    """

    model_type = "qwen3_5_text"
    base_config_key = "text_config"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Normalize an absent rope_scaling to an empty dict so downstream
        # rope helpers can probe it with membership tests instead of
        # special-casing None.
        if self.rope_scaling is None:
            self.rope_scaling = {}


class Qwen3_5Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3_5Model`]. It is used to instantiate a
    Qwen3.5 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen3.5.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3_5TextConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3_5VisionConfig`):
            The config object or dictionary of the vision backbone.
        image_token_id (`int`, *optional*, defaults to 151655):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 151656):
            The video token index to encode the video prompt.
        vision_start_token_id (`int`, *optional*, defaults to 151652):
            The start token index to encode the vision prompt.
        vision_end_token_id (`int`, *optional*, defaults to 151653):
            The end token index to encode the vision prompt.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie the word embeddings.

    ```python
    >>> from transformers import Qwen3_5ForConditionalGeneration, Qwen3_5Config

    >>> # Initializing a Qwen3.5 style configuration
    >>> configuration = Qwen3_5Config()

    >>> # Initializing a model from the Qwen3.5 style configuration
    >>> model = Qwen3_5ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3_5"
    sub_configs = {
        "vision_config": Qwen3_5VisionConfig,
        "text_config": Qwen3_5TextConfig,
    }
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=151655,
        video_token_id=151656,
        vision_start_token_id=151652,
        vision_end_token_id=151653,
        tie_word_embeddings=False,
        **kwargs,
    ):
        # Accept a dict (deserialized JSON), None (use defaults), or an
        # already-constructed config object. Without the final branch, passing
        # a config instance would leave `self.vision_config` unset and crash
        # on first attribute access.
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            self.text_config = self.sub_configs["text_config"]()
        else:
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id
        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)


class Qwen3_5MoeVisionConfig(Qwen3_5VisionConfig):
    # MoE variant shares the dense vision config verbatim; only the registered
    # model type differs.
    model_type = "qwen3_5_moe"


class Qwen3_5MoeTextConfig(Qwen3_5TextConfig):
    # MoE variant shares the dense text config verbatim; only the registered
    # model type differs.
    model_type = "qwen3_5_moe_text"


class Qwen3_5MoeConfig(Qwen3_5Config):
    # Composite config for the MoE variant: identical structure and __init__
    # behavior to Qwen3_5Config, but wires in the MoE-specific vision/text
    # sub-config classes so `sub_configs` resolves to the right types.
    model_type = "qwen3_5_moe"
    sub_configs = {
        "vision_config": Qwen3_5MoeVisionConfig,
        "text_config": Qwen3_5MoeTextConfig,
    }
11 changes: 11 additions & 0 deletions python/sglang/srt/layers/logits_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ class LogitsProcessorOutput:
## Part 5: Customized Info
customized_info: Optional[Dict[str, List[Any]]] = None

mm_input_embeds: Optional[torch.Tensor] = None


@dataclasses.dataclass
class LogitsMetadata:
Expand Down Expand Up @@ -146,6 +148,8 @@ class LogitsMetadata:
# Whether this batch is prefill-only (no token generation needed)
is_prefill_only: bool = False

mm_input_embeds: Optional[torch.Tensor] = None

@classmethod
def from_forward_batch(cls, forward_batch: ForwardBatch):
if (
Expand Down Expand Up @@ -196,6 +200,7 @@ def from_forward_batch(cls, forward_batch: ForwardBatch):
global_num_tokens_for_logprob_cpu=forward_batch.global_num_tokens_for_logprob_cpu,
global_num_tokens_for_logprob_gpu=forward_batch.global_num_tokens_for_logprob_gpu,
dp_padding_mode=DpPaddingMode.SUM_LEN,
mm_input_embeds=forward_batch.mm_input_embeds,
)

def compute_dp_attention_metadata(self):
Expand Down Expand Up @@ -341,6 +346,7 @@ def forward(
return LogitsProcessorOutput(
next_token_logits=sampled_logits,
hidden_states=hidden_states_to_store,
mm_input_embeds=logits_metadata.mm_input_embeds,
)

# Start to process input logprobs
Expand Down Expand Up @@ -386,6 +392,7 @@ def forward(
input_top_logprobs_idx=logprobs_result.input_top_logprobs_idx,
input_token_ids_logprobs_val=logprobs_result.input_token_ids_logprobs_val,
input_token_ids_logprobs_idx=logprobs_result.input_token_ids_logprobs_idx,
mm_input_embeds=logits_metadata.mm_input_embeds,
)

def _get_pruned_states(
Expand Down Expand Up @@ -1067,6 +1074,10 @@ def compute_logprobs_for_multi_item_scoring(
input_top_logprobs_idx=input_top_logprobs_idx,
input_token_ids_logprobs_val=input_token_ids_logprobs_val,
input_token_ids_logprobs_idx=input_token_ids_logprobs_idx,
# FIXME: These fields are not logits-related but are passed through here as a
# workaround since ForwardBatch is local to forward_batch_generation().
# They should be moved to GenerationBatchResult to keep this class clean.
mm_input_embeds=logits_metadata.mm_input_embeds,
)


Expand Down
6 changes: 5 additions & 1 deletion python/sglang/srt/layers/rotary_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -1822,7 +1822,9 @@ def get_rope_index(
**kwargs,
)
if (
model_type.startswith("qwen3_vl") or model_type.startswith("qwen3_vl_moe")
model_type.startswith("qwen3_vl")
or model_type.startswith("qwen3_vl_moe")
or model_type.startswith("qwen3_5")
) and video_grid_thw is not None:
video_grid_thw = torch.repeat_interleave(
video_grid_thw, video_grid_thw[:, 0], dim=0
Expand Down Expand Up @@ -1922,6 +1924,8 @@ def get_rope_index(
"qwen2_vl",
"qwen3_vl",
"qwen3_vl_moe",
"qwen3_5",
"qwen3_5_moe",
):
t_index = (
torch.arange(llm_grid_t, device=position_ids.device)
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/managers/mm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1121,6 +1121,7 @@ def general_mm_embed_routine(
if isinstance(feature, torch.Tensor) and feature.is_cuda:
mm_item.feature = feature.to("cpu", non_blocking=True)
forward_batch.mm_inputs = None
forward_batch.mm_input_embeds = input_embeds
else:
input_embeds = embed_tokens(input_ids)
# Copy to pre-allocated buffer if available (for CUDA graph address stability)
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/model_executor/forward_batch_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ class ForwardBatch(ForwardBatchDeepSeekMHAMixin):
# Speculative decoding
spec_info: Optional[SpecInput] = None
spec_algorithm: SpeculativeAlgorithm = None
mm_input_embeds: Optional[torch.Tensor] = None
capture_hidden_mode: CaptureHiddenMode = None

# For padding
Expand Down
17 changes: 14 additions & 3 deletions python/sglang/srt/model_executor/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
Lfm2Config,
NemotronH_Nano_VL_V2_Config,
NemotronHConfig,
Qwen3_5Config,
Qwen3_5MoeConfig,
Qwen3NextConfig,
)
from sglang.srt.configs.device_config import DeviceConfig
Expand Down Expand Up @@ -1498,8 +1500,15 @@ def qwen3_next_config(self):

@property
def hybrid_gdn_config(self):
config = self.model_config.hf_config
if isinstance(config, Qwen3NextConfig | JetNemotronConfig | JetVLMConfig):
config = self.model_config.hf_config.get_text_config()
if isinstance(
config,
Qwen3NextConfig
| Qwen3_5Config
| Qwen3_5MoeConfig
| JetNemotronConfig
| JetVLMConfig,
):
return config
return None

Expand Down Expand Up @@ -2476,7 +2485,9 @@ def compute_logprobs_only(
def model_is_mrope(self) -> bool:
"""Detect if the model has "mrope" rope_scaling type.
mrope requires keep "rope_deltas" between prompt and decoding phases."""
rope_scaling = getattr(self.model_config.hf_text_config, "rope_scaling", {})
rope_scaling = getattr(
self.model_config.hf_text_config, "rope_parameters", None
) or getattr(self.model_config.hf_text_config, "rope_scaling", {})
if rope_scaling is None:
return False
is_mrope_enabled = "mrope_section" in rope_scaling
Expand Down
Loading
Loading