python/sglang/srt/layers/rotary_embedding.py (4 changes: 3 additions & 1 deletion)

@@ -3411,7 +3411,9 @@ def get_rope(
elif "type" in rope_scaling:
scaling_type = rope_scaling["type"]
else:
raise ValueError("Unknown RoPE scaling type")
raise ValueError(
f"Unknown RoPE scaling type, rope_scaling is {rope_scaling}"
)

if scaling_type == "llama3":
scaling_factor = rope_scaling["factor"]
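For context, this is the config-validation path the new message improves: when a checkpoint's `rope_scaling` dict carries no recognized type key, the error now echoes the dict itself. A minimal runnable sketch, assuming the branch ahead of the visible `elif` checks the HF-style `"rope_type"` key (the dict below is hypothetical):

```python
# Hypothetical checkpoint config missing both "rope_type" and "type".
rope_scaling = {"factor": 8.0}

if "rope_type" in rope_scaling:
    scaling_type = rope_scaling["rope_type"]
elif "type" in rope_scaling:
    scaling_type = rope_scaling["type"]
else:
    # Echoing the dict pinpoints the offending config in the traceback.
    raise ValueError(f"Unknown RoPE scaling type, rope_scaling is {rope_scaling}")
```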

python/sglang/srt/models/qwen3_5.py (26 changes: 19 additions & 7 deletions)

@@ -318,8 +318,14 @@ def __init__(
         super().__init__()
         self.config = config
         self.layer_id = layer_id
+
+        linear_attn_quant_config = (
+            None
+            if quant_config and quant_config.get_name() == "modelopt_fp4"
+            else quant_config
+        )
         self.linear_attn = Qwen3_5GatedDeltaNet(
-            config, layer_id, quant_config, alt_stream, prefix
+            config, layer_id, linear_attn_quant_config, alt_stream, prefix
         )
 
         # NOTE: Determine the MLP type based on the model type
@@ -458,13 +464,19 @@ def __init__(
             dtype=torch.get_default_dtype(),
         )
 
+        attn_quant_config = (
+            None
+            if quant_config and quant_config.get_name() == "modelopt_fp4"
+            else quant_config
+        )
+
         self.qkv_proj = QKVParallelLinear(
             config.hidden_size,
             self.head_dim,
             self.total_num_heads * (1 + self.attn_output_gate),
             self.total_num_kv_heads,
             bias=False,
-            quant_config=quant_config,
+            quant_config=attn_quant_config,
             tp_rank=self.attn_tp_rank,
             tp_size=self.attn_tp_size,
             prefix=add_prefix("qkv_proj", prefix),
@@ -474,7 +486,7 @@
             self.total_num_heads * self.head_dim,
             config.hidden_size,
             bias=False,
-            quant_config=quant_config,
+            quant_config=attn_quant_config,
             reduce_results=False,
             tp_rank=self.attn_tp_rank,
             tp_size=self.attn_tp_size,
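The same gating expression appears in both the gated-delta-net and the attention-projection constructors: when the checkpoint is ModelOpt NVFP4, these layers stay unquantized, so they must be built without a quant config. A minimal sketch of the shared pattern; the helper name is hypothetical, as the PR inlines the expression rather than factoring it out:

```python
from typing import Any, Optional

def maybe_drop_fp4_quant(quant_config: Optional[Any]) -> Optional[Any]:
    # Layers that stay unquantized in the nvfp4 checkpoint must be
    # constructed with quant_config=None so they fall back to plain weights.
    if quant_config and quant_config.get_name() == "modelopt_fp4":
        return None
    return quant_config
```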
@@ -1155,9 +1167,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
             "_k_scale",
             ".v_scale",
             "_v_scale",
-            ".weight_scale",
             "_weight_scale",
-            ".input_scale",
             "_input_scale",
         )
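With the dot-prefixed scale suffixes gone from the ignore list, NVFP4 weight and input scales are loaded instead of silently skipped. A simplified sketch of the loader's skip check (suffix tuple abridged to the lines visible in this hunk; the parameter name is illustrative):

```python
ignore_suffixes = ("_k_scale", ".v_scale", "_v_scale", "_weight_scale", "_input_scale")

def should_skip(name: str, params_dict: dict) -> bool:
    # Skip only unknown extra parameters whose suffix is still ignored.
    return name.endswith(ignore_suffixes) and name not in params_dict

# ".weight_scale" is no longer an ignored suffix, so this FP4 scale loads.
print(should_skip("model.layers.0.self_attn.qkv_proj.weight_scale", {}))  # False
```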

@@ -1204,7 +1214,9 @@ def load_fused_expert_weights(
                 name = name.replace(".self_attn", "")
 
             for param_name, weight_name, shard_id in stacked_params_mapping:
-                if "experts.gate_up_proj" in name or "experts.down_proj" in name:
+                if name.endswith("experts.gate_up_proj") or name.endswith(
+                    "experts.down_proj"
+                ):
                     is_fused_expert = True
                     expert_params_mapping = fused_expert_params_mapping
 
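The old substring check also fired for auxiliary tensors that merely contain the projection name, such as per-tensor scales; `endswith` routes only the projection weight itself through the fused-expert path. A quick illustration (the parameter name is hypothetical):

```python
name = "model.layers.0.mlp.experts.gate_up_proj.weight_scale"

# Old check: matches the scale tensor as well as the weight.
assert "experts.gate_up_proj" in name
# New check: the scale tensor no longer takes the fused-expert path.
assert not name.endswith("experts.gate_up_proj")
```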
@@ -1274,7 +1286,7 @@ def load_fused_expert_weights(
                             num_experts,
                         )
                     else:
-                        # Skip loading extra parameters for GPTQ/modelopt models.
+                        # Skip loading extra parameters for GPTQ models.
                         if (
                             name_mapped.endswith(ignore_suffixes)
                             and name_mapped not in params_dict

python/sglang/srt/models/qwen3_5_mtp.py (4 changes: 4 additions & 0 deletions)

@@ -48,6 +48,10 @@ def __init__(
         if self.is_multimodal:
             config = config.text_config
 
+        # The MTP model is unquantized in the nvfp4 checkpoint.
+        if quant_config and quant_config.get_name() == "modelopt_fp4":
+            quant_config = None
+
         self.config = config
         self.tp_size = get_tensor_model_parallel_world_size()
         self.quant_config = quant_config
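A minimal sketch of the effect, using a stub in place of the real ModelOpt config object:

```python
class FakeModelOptFp4Config:
    # Stand-in for the real quant config; only get_name() matters here.
    def get_name(self) -> str:
        return "modelopt_fp4"

quant_config = FakeModelOptFp4Config()
if quant_config and quant_config.get_name() == "modelopt_fp4":
    quant_config = None  # MTP weights ship unquantized in the nvfp4 checkpoint

# self.quant_config then stores None, so every MTP layer is built unquantized.
assert quant_config is None
```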