From e6c95145f400275261ba14171c6f52a929f3731d Mon Sep 17 00:00:00 2001
From: Xinyuan Tong
Date: Mon, 1 Dec 2025 20:10:08 +0000
Subject: [PATCH 01/11] prepare for ministral 3

Signed-off-by: Xinyuan Tong
---
 python/sglang/srt/configs/olmo3.py      |   2 -
 python/sglang/srt/configs/qwen3_next.py |   2 -
 python/sglang/srt/models/ministral3.py  | 151 ++++++++++++++++++++++++
 python/sglang/srt/models/pixtral.py     |   2 +
 4 files changed, 153 insertions(+), 4 deletions(-)
 create mode 100644 python/sglang/srt/models/ministral3.py

diff --git a/python/sglang/srt/configs/olmo3.py b/python/sglang/srt/configs/olmo3.py
index 95e7c2537d8d..8640342196e5 100644
--- a/python/sglang/srt/configs/olmo3.py
+++ b/python/sglang/srt/configs/olmo3.py
@@ -17,7 +17,6 @@
 import enum
 
 from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_rope_utils import rope_config_validation
 from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
@@ -90,7 +89,6 @@ def __init__(
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
-        rope_config_validation(self)
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
 
diff --git a/python/sglang/srt/configs/qwen3_next.py b/python/sglang/srt/configs/qwen3_next.py
index cd1b6f1ea59a..8d0981c39854 100644
--- a/python/sglang/srt/configs/qwen3_next.py
+++ b/python/sglang/srt/configs/qwen3_next.py
@@ -17,7 +17,6 @@
 import enum
 
 from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_rope_utils import rope_config_validation
 from transformers.utils import logging
 
 from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
@@ -226,7 +225,6 @@ def __init__(
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
         self.head_dim = head_dim
-        rope_config_validation(self)
 
         # linear attention (gdn now part)
         self.linear_conv_kernel_dim = linear_conv_kernel_dim
diff --git a/python/sglang/srt/models/ministral3.py b/python/sglang/srt/models/ministral3.py
new file mode 100644
index 000000000000..db8b876fe3c8
--- /dev/null
+++ b/python/sglang/srt/models/ministral3.py
@@ -0,0 +1,151 @@
+from typing import Any, Dict, Optional
+
+import torch
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.llama import (
+    LlamaAttention,
+    LlamaDecoderLayer,
+    LlamaForCausalLM,
+    LlamaModel,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+
+def _get_llama_4_attn_scale(
+    positions_ids: torch.Tensor, beta: float, max_position_embeddings: int
+) -> torch.Tensor:
+    scaling = 1 + beta * torch.log(
+        1 + torch.floor(positions_ids / max_position_embeddings)
+    )
+    return scaling.unsqueeze(-1)
+
+
+class Ministral3Attention(LlamaAttention):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        bias: bool = False,
+    ) -> None:
+        super().__init__(
+            config,
+            hidden_size,
+            num_heads,
+            num_kv_heads,
+            layer_id,
+            rope_theta,
+            rope_scaling,
+            rope_is_neox_style,
+            max_position_embeddings,
+            quant_config,
+            prefix,
+            bias,
+        )
+        # Ministral3 specific: llama 4 style scaling beta
+        
self.llama_4_scaling_beta = None + if hasattr(config, "rope_parameters") and config.rope_parameters: + self.llama_4_scaling_beta = config.rope_parameters.get( + "llama_4_scaling_beta" + ) + + # sliding window + self.sliding_window = getattr(config, "sliding_window", None) + if self.sliding_window is not None: + # Update RadixAttention with sliding window if needed + # currently RadixAttention in sglang handles this mostly via logic in forward/flashinfer + pass + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + forward_batch: ForwardBatch, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + # Apply RoPE + q, k = self.rotary_emb(positions, q, k) + + # Ministral3 / Llama 4 scaling + if self.llama_4_scaling_beta is not None: + scale = _get_llama_4_attn_scale( + positions, self.llama_4_scaling_beta, self.max_position_embeddings + ).to(q.dtype) + # q shape is [batch_size * seq_len, num_heads * head_dim] or [batch_size * seq_len, num_heads, head_dim] + # positions is [batch_size * seq_len] + # scale is [batch_size * seq_len, 1] + # We need to reshape q to apply scale correctly if it's flattened + # Assuming q is (total_tokens, num_heads * head_dim) + q = q.view(-1, self.num_heads, self.head_dim) + q = q * scale.unsqueeze(1) # Broadcast over heads + q = q.view(-1, self.num_heads * self.head_dim) + + attn_output = self.attn(q, k, v, forward_batch) + output, _ = self.o_proj(attn_output) + return output + + +class Ministral3DecoderLayer(LlamaDecoderLayer): + def __init__(self, config, layer_id=0, quant_config=None, prefix=""): + super().__init__(config, layer_id, quant_config, prefix) + self.self_attn = Ministral3Attention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + layer_id=layer_id, + rope_theta=getattr(config, "rope_theta", 10000), + rope_scaling=getattr(config, "rope_scaling", None), + max_position_embeddings=getattr(config, "max_position_embeddings", 8192), + quant_config=quant_config, + prefix=add_prefix("self_attn", prefix), + bias=getattr(config, "attention_bias", False) + or getattr(config, "bias", False), + ) + + +class Ministral3Model(LlamaModel): + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + # Override layer creation to use Ministral3Attention + super().__init__(config, quant_config, prefix) + + self.layers, self.start_layer, self.end_layer = make_layers( + config.num_hidden_layers, + lambda idx, prefix: Ministral3DecoderLayer( + config=config, quant_config=quant_config, layer_id=idx, prefix=prefix + ), + pp_rank=self.pp_group.rank_in_group, + pp_size=self.pp_group.world_size, + prefix="model.layers", + ) + + +class Ministral3ForCausalLM(LlamaForCausalLM): + def _init_model( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + return Ministral3Model(config, quant_config, prefix=prefix) + + +EntryClass = [Ministral3ForCausalLM] diff --git a/python/sglang/srt/models/pixtral.py b/python/sglang/srt/models/pixtral.py index 249a5ce81bba..6ec4ce51a1ef 100644 --- a/python/sglang/srt/models/pixtral.py +++ b/python/sglang/srt/models/pixtral.py @@ -107,6 +107,8 @@ def __init__( dropout=0.0, use_context_forward=False, flatten_batch=False, + qkv_bias=False, + proj_bias=False, prefix=f"{prefix}.attention", ) From 
1a14eb0911620bd792d536cd3cb9d2a8db76f6c6 Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Tue, 2 Dec 2025 00:09:35 +0000 Subject: [PATCH 02/11] fix rope config Signed-off-by: Xinyuan Tong --- FlashMLA | 1 + benchmark/mmmu/bench_sglang.py | 16 ++++++------- benchmark/mmmu/eval_utils.py | 24 ++++++++++++------- .../sglang/srt/model_executor/model_runner.py | 4 ++-- python/sglang/srt/models/ministral3.py | 10 ++++---- 5 files changed, 32 insertions(+), 23 deletions(-) create mode 160000 FlashMLA diff --git a/FlashMLA b/FlashMLA new file mode 160000 index 000000000000..1408756a88e5 --- /dev/null +++ b/FlashMLA @@ -0,0 +1 @@ +Subproject commit 1408756a88e52a25196b759eaf8db89d2b51b5a1 diff --git a/benchmark/mmmu/bench_sglang.py b/benchmark/mmmu/bench_sglang.py index 9a0bf4529047..d9426ae5a3ac 100644 --- a/benchmark/mmmu/bench_sglang.py +++ b/benchmark/mmmu/bench_sglang.py @@ -83,9 +83,9 @@ async def process_sample( assert image is not None image_path = sample["image_path"] extra_body = None if lora_path is None else {"lora_path": lora_path} - response = await client.chat.completions.create( - model="default", - messages=[ + payload = { + "model": "default", + "messages": [ { "role": "user", "content": [ @@ -95,11 +95,11 @@ async def process_sample( ], } ], - temperature=0, - max_completion_tokens=sampling_params["max_new_tokens"], - max_tokens=sampling_params["max_new_tokens"], - extra_body=extra_body, - ) + "extra_body": extra_body, + } + if sampling_params: + payload.update(sampling_params) + response = await client.chat.completions.create(**payload) return sample, response.choices[0].message.content diff --git a/benchmark/mmmu/eval_utils.py b/benchmark/mmmu/eval_utils.py index 955a3bfa5e49..55efe619748a 100644 --- a/benchmark/mmmu/eval_utils.py +++ b/benchmark/mmmu/eval_utils.py @@ -36,7 +36,8 @@ class EvalArgs: profile: bool = False profile_number: int = 5 concurrency: int = 1 - max_new_tokens: int = 30 + max_new_tokens: Optional[int] = None + temperature: Optional[float] = None response_answer_regex: str = "(.*)" lora_path: Optional[str] = None @@ -101,6 +102,12 @@ def add_cli_args(parser: argparse.ArgumentParser): default=EvalArgs.max_new_tokens, help="Maximum number of new tokens to generate per sample.", ) + parser.add_argument( + "--temperature", + type=float, + default=EvalArgs.temperature, + help="Sampling temperature for generation.", + ) parser.add_argument( "--response-answer-regex", type=str, @@ -241,19 +248,20 @@ def process_sample(i, sample): def get_sampling_params(eval_args): - max_new_tokens = eval_args.max_new_tokens - temperature = 0.001 - extra_request_body = {} if eval_args.extra_request_body: extra_request_body = json.loads(eval_args.extra_request_body) - - return { - "temperature": temperature, - "max_new_tokens": max_new_tokens, + sampling_params = { **extra_request_body, } + if eval_args.max_new_tokens is not None and eval_args.max_new_tokens > 0: + sampling_params.update({"max_completion_tokens": eval_args.max_new_tokens}) + + if eval_args.temperature is not None: + sampling_params.update({"temperature": eval_args.temperature}) + + return sampling_params # ----------- Process Multi-choice ------------- def parse_multi_choice_response(response, all_choices, index2ans): diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 8e5bd0074848..83a503fa89fb 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -580,8 +580,8 @@ def 
check_quantized_moe_compatibility(self): quantization_config := getattr( self.model_config.hf_config, "quantization_config", None ) - ) is not None and "weight_block_size" in quantization_config: - weight_block_size_n = quantization_config["weight_block_size"][0] + ) is not None and (weight_block_size := quantization_config.get("weight_block_size", None)) is not None: + weight_block_size_n = weight_block_size[0] if self.tp_size % self.moe_ep_size != 0: raise ValueError( diff --git a/python/sglang/srt/models/ministral3.py b/python/sglang/srt/models/ministral3.py index db8b876fe3c8..ac1678dd9f3e 100644 --- a/python/sglang/srt/models/ministral3.py +++ b/python/sglang/srt/models/ministral3.py @@ -31,8 +31,8 @@ def __init__( num_heads: int, num_kv_heads: int, layer_id: int = 0, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_theta: float = 1000000.0, + rope_scaling: Optional[Dict[str, Any]] = {}, rope_is_neox_style: bool = True, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, @@ -107,9 +107,9 @@ def __init__(self, config, layer_id=0, quant_config=None, prefix=""): num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, layer_id=layer_id, - rope_theta=getattr(config, "rope_theta", 10000), - rope_scaling=getattr(config, "rope_scaling", None), - max_position_embeddings=getattr(config, "max_position_embeddings", 8192), + rope_theta=getattr(config, "rope_parameters", {}).get("rope_theta", 1000000.0), + rope_scaling=getattr(config, "rope_parameters", {}), # rope_scaling is rope_parameters in Ministral3Config + max_position_embeddings=getattr(config, "original_max_position_embeddings", 16384), quant_config=quant_config, prefix=add_prefix("self_attn", prefix), bias=getattr(config, "attention_bias", False) From 793ccb4d4b3e7110bb0ed79c599a2d5959f6754c Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Tue, 2 Dec 2025 19:47:44 +0000 Subject: [PATCH 03/11] remap fp8 weights Signed-off-by: Xinyuan Tong --- python/sglang/srt/model_loader/weight_utils.py | 8 ++++++++ python/sglang/srt/models/llama.py | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/python/sglang/srt/model_loader/weight_utils.py b/python/sglang/srt/model_loader/weight_utils.py index 4c236f7150d8..3683d8fa97c5 100644 --- a/python/sglang/srt/model_loader/weight_utils.py +++ b/python/sglang/srt/model_loader/weight_utils.py @@ -534,6 +534,14 @@ def filter_duplicate_safetensors_files( # torch state_dict to safetensors file holding that weight. 
index_file_name = os.path.join(hf_folder, index_file)
     if not os.path.isfile(index_file_name):
+        # NOTE: this is a trick for handling Mistral models:
+        # skip the unsupported consolidated.safetensors file
+        if len(hf_weights_files) == 2:
+            hf_weights_files.sort()
+            if hf_weights_files[0].endswith(
+                "consolidated.safetensors"
+            ) and hf_weights_files[1].endswith("model.safetensors"):
+                return [hf_weights_files[1]]
         return hf_weights_files
 
     # Iterate through the weight_map (weight_name: safetensors files)
diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py
index dbf6968eef04..4176af28d704 100644
--- a/python/sglang/srt/models/llama.py
+++ b/python/sglang/srt/models/llama.py
@@ -570,6 +570,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters())
 
         for name, loaded_weight in weights:
+            if name.endswith(".activation_scale"):
+                name = name.replace(".activation_scale", ".input_scale")
+            if name.endswith(".weight_scale_inv"):
+                name = name.replace(".weight_scale_inv", ".weight_scale")
+
             layer_id = get_layer_id(name)
             if (
                 layer_id is not None
From ad2c34eb22b06192c7dc74e1767e35365bff411a Mon Sep 17 00:00:00 2001
From: Xinyuan Tong
Date: Tue, 2 Dec 2025 19:48:16 +0000
Subject: [PATCH 04/11] lint

Signed-off-by: Xinyuan Tong
---
 benchmark/mmmu/eval_utils.py                     |  5 +++--
 python/sglang/srt/model_executor/model_runner.py |  4 +++-
 python/sglang/srt/models/ministral3.py           | 12 +++++++++---
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/benchmark/mmmu/eval_utils.py b/benchmark/mmmu/eval_utils.py
index 55efe619748a..b3edd69fc1ce 100644
--- a/benchmark/mmmu/eval_utils.py
+++ b/benchmark/mmmu/eval_utils.py
@@ -257,12 +257,13 @@ def get_sampling_params(eval_args):
 
     if eval_args.max_new_tokens is not None and eval_args.max_new_tokens > 0:
         sampling_params.update({"max_completion_tokens": eval_args.max_new_tokens})
-
+
     if eval_args.temperature is not None:
         sampling_params.update({"temperature": eval_args.temperature})
-
+
     return sampling_params
+

 # ----------- Process Multi-choice -------------
 def parse_multi_choice_response(response, all_choices, index2ans):
     """
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 83a503fa89fb..3162106229c7 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -580,7 +580,9 @@ def check_quantized_moe_compatibility(self):
             quantization_config := getattr(
                 self.model_config.hf_config, "quantization_config", None
             )
-        ) is not None and (weight_block_size := quantization_config.get("weight_block_size", None)) is not None:
+        ) is not None and (
+            weight_block_size := quantization_config.get("weight_block_size", None)
+        ) is not None:
             weight_block_size_n = weight_block_size[0]
 
             if self.tp_size % self.moe_ep_size != 0:
diff --git a/python/sglang/srt/models/ministral3.py b/python/sglang/srt/models/ministral3.py
index ac1678dd9f3e..460c7b30fb5e 100644
--- a/python/sglang/srt/models/ministral3.py
+++ b/python/sglang/srt/models/ministral3.py
@@ -107,9 +107,15 @@ def __init__(self, config, layer_id=0, quant_config=None, prefix=""):
             num_heads=config.num_attention_heads,
             num_kv_heads=config.num_key_value_heads,
             layer_id=layer_id,
-            rope_theta=getattr(config, "rope_parameters", {}).get("rope_theta", 1000000.0),
-            rope_scaling=getattr(config, "rope_parameters", {}), # rope_scaling is rope_parameters in Ministral3Config
-            max_position_embeddings=getattr(config, 
"original_max_position_embeddings", 16384), + rope_theta=getattr(config, "rope_parameters", {}).get( + "rope_theta", 1000000.0 + ), + rope_scaling=getattr( + config, "rope_parameters", {} + ), # rope_scaling is rope_parameters in Ministral3Config + max_position_embeddings=getattr( + config, "original_max_position_embeddings", 16384 + ), quant_config=quant_config, prefix=add_prefix("self_attn", prefix), bias=getattr(config, "attention_bias", False) From d4e5f2903a541c60570761ffdfa71e1b27adbd1a Mon Sep 17 00:00:00 2001 From: Yueming Yuan Date: Tue, 2 Dec 2025 16:01:36 -0800 Subject: [PATCH 05/11] fix ministral fp8 vision model --- python/sglang/srt/layers/quantization/fp8.py | 5 ++++- python/sglang/srt/layers/quantization/utils.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index 878f70619728..2b45819eb402 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -158,7 +158,10 @@ def from_config(cls, config: Dict[str, Any]) -> Fp8Config: quant_method = cls.get_from_keys(config, ["quant_method"]) is_checkpoint_fp8_serialized = "fp8" in quant_method activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) - ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) + ignored_layers = cls.get_from_keys_or(config, ["ignored_layers", "modules_to_not_convert"], None) + if ignored_layers: + # hacking ministral + ignored_layers = [layer.replace("model.", "") for layer in ignored_layers] weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None) return cls( is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, diff --git a/python/sglang/srt/layers/quantization/utils.py b/python/sglang/srt/layers/quantization/utils.py index d407b95f2776..a9483357dad7 100644 --- a/python/sglang/srt/layers/quantization/utils.py +++ b/python/sglang/srt/layers/quantization/utils.py @@ -64,7 +64,7 @@ def is_layer_skipped( is_skipped = None for shard_prefix in shard_prefixes: - is_shard_skipped = shard_prefix in ignored_layers + is_shard_skipped = any(ignored in shard_prefix for ignored in ignored_layers) if is_skipped is None: is_skipped = is_shard_skipped @@ -75,7 +75,7 @@ def is_layer_skipped( "to have the same precision." 
) else: - is_skipped = prefix in ignored_layers + is_skipped = any(ignored in prefix for ignored in ignored_layers) if "gate_up_proj" in prefix: prefix_gate = prefix.replace("gate_up_proj", "gate_proj") prefix_up = prefix.replace("gate_up_proj", "up_proj") From 14799d16555d8fd94a024fd5017c227f5c3b8357 Mon Sep 17 00:00:00 2001 From: Yueming Yuan Date: Tue, 2 Dec 2025 19:10:26 -0800 Subject: [PATCH 06/11] lint --- python/sglang/srt/layers/quantization/fp8.py | 4 +++- python/sglang/srt/layers/quantization/utils.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index 2b45819eb402..e8952fbe4ffe 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -158,7 +158,9 @@ def from_config(cls, config: Dict[str, Any]) -> Fp8Config: quant_method = cls.get_from_keys(config, ["quant_method"]) is_checkpoint_fp8_serialized = "fp8" in quant_method activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) - ignored_layers = cls.get_from_keys_or(config, ["ignored_layers", "modules_to_not_convert"], None) + ignored_layers = cls.get_from_keys_or( + config, ["ignored_layers", "modules_to_not_convert"], None + ) if ignored_layers: # hacking ministral ignored_layers = [layer.replace("model.", "") for layer in ignored_layers] diff --git a/python/sglang/srt/layers/quantization/utils.py b/python/sglang/srt/layers/quantization/utils.py index a9483357dad7..fc81f3140660 100644 --- a/python/sglang/srt/layers/quantization/utils.py +++ b/python/sglang/srt/layers/quantization/utils.py @@ -64,7 +64,9 @@ def is_layer_skipped( is_skipped = None for shard_prefix in shard_prefixes: - is_shard_skipped = any(ignored in shard_prefix for ignored in ignored_layers) + is_shard_skipped = any( + ignored in shard_prefix for ignored in ignored_layers + ) if is_skipped is None: is_skipped = is_shard_skipped From 40e3baf4dae4c3af2918d387972f85cc2be8e255 Mon Sep 17 00:00:00 2001 From: Yueming Yuan Date: Wed, 3 Dec 2025 02:06:08 -0800 Subject: [PATCH 07/11] add unit test --- test/srt/models/test_ministral3_models.py | 31 +++++++++++++++++++++++ test/srt/run_suite.py | 1 + 2 files changed, 32 insertions(+) create mode 100644 test/srt/models/test_ministral3_models.py diff --git a/test/srt/models/test_ministral3_models.py b/test/srt/models/test_ministral3_models.py new file mode 100644 index 000000000000..27ddd7d660e7 --- /dev/null +++ b/test/srt/models/test_ministral3_models.py @@ -0,0 +1,31 @@ +import unittest +from types import SimpleNamespace + +from sglang.test.gsm8k_mixin import GSM8KMixin +from sglang.test.mmmu_vlm_mixin import MMMUVLMMixin +from sglang.test.test_utils import CustomTestCase + +MODEL = "mistralai/Ministral-3-3B-Instruct-2512" + + +class TestMinistral3TextOnly(GSM8KMixin, CustomTestCase): + accuracy = 0.6 + model = MODEL + other_args = ["--trust-remote-code"] + + +class TestMinistral3MMMU(MMMUVLMMixin, CustomTestCase): + accuracy = 0.3 + model = MODEL + other_args = ["--trust-remote-code"] + mmmu_args = ["--limit=0.1"] + """`--limit=0.1`: 10 percent of each task - this is fine for testing since the nominal result isn't interesting - this run is just to prevent relative regressions.""" + + def test_vlm_mmmu_benchmark(self): + self._run_vlm_mmmu_test( + SimpleNamespace(model=self.model, mmmu_accuracy=self.accuracy), "./logs" + ) + +if __name__ == "__main__": + unittest.main() + diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 
49655d22045f..c74cf81cfad2 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -24,6 +24,7 @@ TestFile("models/test_embedding_models.py", 73), TestFile("models/test_encoder_embedding_models.py", 460), TestFile("models/test_generation_models.py", 103), + TestFile("models/test_ministral3_models.py", 350), TestFile("models/test_nvidia_nemotron_nano_v2.py", 160), TestFile("models/test_nvidia_nemotron_nano_v2_vl.py", 350), # GSM8k + MMMU TestFile("models/test_qwen_models.py", 150), From 93a13ed7333cf229116973af6aece52dd3038007 Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Thu, 4 Dec 2025 14:35:04 +0000 Subject: [PATCH 08/11] remove submodule FlashMLA --- FlashMLA | 1 - 1 file changed, 1 deletion(-) delete mode 160000 FlashMLA diff --git a/FlashMLA b/FlashMLA deleted file mode 160000 index 1408756a88e5..000000000000 --- a/FlashMLA +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1408756a88e52a25196b759eaf8db89d2b51b5a1 From fc69f4a78c11b205d570ecd149964e30903366db Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Thu, 4 Dec 2025 14:35:34 +0000 Subject: [PATCH 09/11] lint Signed-off-by: Xinyuan Tong --- test/srt/models/test_ministral3_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/srt/models/test_ministral3_models.py b/test/srt/models/test_ministral3_models.py index 27ddd7d660e7..6f6ca51b4e8d 100644 --- a/test/srt/models/test_ministral3_models.py +++ b/test/srt/models/test_ministral3_models.py @@ -15,7 +15,7 @@ class TestMinistral3TextOnly(GSM8KMixin, CustomTestCase): class TestMinistral3MMMU(MMMUVLMMixin, CustomTestCase): - accuracy = 0.3 + accuracy = 0.3 model = MODEL other_args = ["--trust-remote-code"] mmmu_args = ["--limit=0.1"] @@ -26,6 +26,6 @@ def test_vlm_mmmu_benchmark(self): SimpleNamespace(model=self.model, mmmu_accuracy=self.accuracy), "./logs" ) + if __name__ == "__main__": unittest.main() - From 1e8fd6435b61b0ef3c68952c10b23cec5763fd86 Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Thu, 4 Dec 2025 15:04:15 +0000 Subject: [PATCH 10/11] update Signed-off-by: Xinyuan Tong --- test/srt/run_suite.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index b37866c787e4..65e11705aa3c 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -214,6 +214,7 @@ TestFile("test_moe_eval_accuracy_large.py"), TestFile("test_vision_openai_server_common.py"), TestFile("test_profile_v2.py"), + TestFile("models/test_ministral3_models.py"), ], } From 44beef7de1fc9284005bbf5fdad88aa47d51c96e Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Thu, 4 Dec 2025 16:00:59 +0000 Subject: [PATCH 11/11] fix processor conflict Signed-off-by: Xinyuan Tong --- python/sglang/srt/multimodal/processors/pixtral.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/multimodal/processors/pixtral.py b/python/sglang/srt/multimodal/processors/pixtral.py index 6b6ab34ad1a0..b923ff342a19 100644 --- a/python/sglang/srt/multimodal/processors/pixtral.py +++ b/python/sglang/srt/multimodal/processors/pixtral.py @@ -56,7 +56,8 @@ def __init__(self, hf_config, server_args, _processor, *args, **kwargs): self.patch_size = self.vision_config.patch_size self._processor.patch_size = self.patch_size - self._processor.spatial_merge_size = self.vision_config.spatial_merge_size + if hasattr(self.vision_config, "spatial_merge_size"): + self._processor.spatial_merge_size = self.vision_config.spatial_merge_size self.mm_tokens = MultimodalSpecialTokens( image_token=_processor.image_token,