From e6c95145f400275261ba14171c6f52a929f3731d Mon Sep 17 00:00:00 2001
From: Xinyuan Tong
Date: Mon, 1 Dec 2025 20:10:08 +0000
Subject: [PATCH 01/11] prepare for ministral 3

Signed-off-by: Xinyuan Tong
---
 python/sglang/srt/configs/olmo3.py      |   2 -
 python/sglang/srt/configs/qwen3_next.py |   2 -
 python/sglang/srt/models/ministral3.py  | 151 ++++++++++++++++++++++++
 python/sglang/srt/models/pixtral.py     |   2 +
 4 files changed, 153 insertions(+), 4 deletions(-)
 create mode 100644 python/sglang/srt/models/ministral3.py

diff --git a/python/sglang/srt/configs/olmo3.py b/python/sglang/srt/configs/olmo3.py
index 95e7c2537d8d..8640342196e5 100644
--- a/python/sglang/srt/configs/olmo3.py
+++ b/python/sglang/srt/configs/olmo3.py
@@ -17,7 +17,6 @@
 import enum
 
 from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_rope_utils import rope_config_validation
 from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
@@ -90,7 +89,6 @@ def __init__(
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
-        rope_config_validation(self)
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
 
diff --git a/python/sglang/srt/configs/qwen3_next.py b/python/sglang/srt/configs/qwen3_next.py
index cd1b6f1ea59a..8d0981c39854 100644
--- a/python/sglang/srt/configs/qwen3_next.py
+++ b/python/sglang/srt/configs/qwen3_next.py
@@ -17,7 +17,6 @@
 import enum
 
 from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_rope_utils import rope_config_validation
 from transformers.utils import logging
 
 from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
@@ -226,7 +225,6 @@ def __init__(
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
         self.head_dim = head_dim
-        rope_config_validation(self)
 
         # linear attention (gdn now part)
         self.linear_conv_kernel_dim = linear_conv_kernel_dim
diff --git a/python/sglang/srt/models/ministral3.py b/python/sglang/srt/models/ministral3.py
new file mode 100644
index 000000000000..db8b876fe3c8
--- /dev/null
+++ b/python/sglang/srt/models/ministral3.py
@@ -0,0 +1,151 @@
+from typing import Any, Dict, Optional
+
+import torch
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.llama import (
+    LlamaAttention,
+    LlamaDecoderLayer,
+    LlamaForCausalLM,
+    LlamaModel,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+
+def _get_llama_4_attn_scale(
+    positions_ids: torch.Tensor, beta: float, max_position_embeddings: int
+) -> torch.Tensor:
+    scaling = 1 + beta * torch.log(
+        1 + torch.floor(positions_ids / max_position_embeddings)
+    )
+    return scaling.unsqueeze(-1)
+
+
+class Ministral3Attention(LlamaAttention):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        bias: bool = False,
+    ) -> None:
+        super().__init__(
+            config,
+            hidden_size,
+            num_heads,
+            num_kv_heads,
+            layer_id,
+            rope_theta,
+            rope_scaling,
+            rope_is_neox_style,
+            max_position_embeddings,
+            quant_config,
+            prefix,
+            bias,
+        )
+        # Ministral3 specific: llama 4 style scaling beta
+        
self.llama_4_scaling_beta = None + if hasattr(config, "rope_parameters") and config.rope_parameters: + self.llama_4_scaling_beta = config.rope_parameters.get( + "llama_4_scaling_beta" + ) + + # sliding window + self.sliding_window = getattr(config, "sliding_window", None) + if self.sliding_window is not None: + # Update RadixAttention with sliding window if needed + # currently RadixAttention in sglang handles this mostly via logic in forward/flashinfer + pass + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + forward_batch: ForwardBatch, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + # Apply RoPE + q, k = self.rotary_emb(positions, q, k) + + # Ministral3 / Llama 4 scaling + if self.llama_4_scaling_beta is not None: + scale = _get_llama_4_attn_scale( + positions, self.llama_4_scaling_beta, self.max_position_embeddings + ).to(q.dtype) + # q shape is [batch_size * seq_len, num_heads * head_dim] or [batch_size * seq_len, num_heads, head_dim] + # positions is [batch_size * seq_len] + # scale is [batch_size * seq_len, 1] + # We need to reshape q to apply scale correctly if it's flattened + # Assuming q is (total_tokens, num_heads * head_dim) + q = q.view(-1, self.num_heads, self.head_dim) + q = q * scale.unsqueeze(1) # Broadcast over heads + q = q.view(-1, self.num_heads * self.head_dim) + + attn_output = self.attn(q, k, v, forward_batch) + output, _ = self.o_proj(attn_output) + return output + + +class Ministral3DecoderLayer(LlamaDecoderLayer): + def __init__(self, config, layer_id=0, quant_config=None, prefix=""): + super().__init__(config, layer_id, quant_config, prefix) + self.self_attn = Ministral3Attention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + layer_id=layer_id, + rope_theta=getattr(config, "rope_theta", 10000), + rope_scaling=getattr(config, "rope_scaling", None), + max_position_embeddings=getattr(config, "max_position_embeddings", 8192), + quant_config=quant_config, + prefix=add_prefix("self_attn", prefix), + bias=getattr(config, "attention_bias", False) + or getattr(config, "bias", False), + ) + + +class Ministral3Model(LlamaModel): + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + # Override layer creation to use Ministral3Attention + super().__init__(config, quant_config, prefix) + + self.layers, self.start_layer, self.end_layer = make_layers( + config.num_hidden_layers, + lambda idx, prefix: Ministral3DecoderLayer( + config=config, quant_config=quant_config, layer_id=idx, prefix=prefix + ), + pp_rank=self.pp_group.rank_in_group, + pp_size=self.pp_group.world_size, + prefix="model.layers", + ) + + +class Ministral3ForCausalLM(LlamaForCausalLM): + def _init_model( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + return Ministral3Model(config, quant_config, prefix=prefix) + + +EntryClass = [Ministral3ForCausalLM] diff --git a/python/sglang/srt/models/pixtral.py b/python/sglang/srt/models/pixtral.py index 249a5ce81bba..6ec4ce51a1ef 100644 --- a/python/sglang/srt/models/pixtral.py +++ b/python/sglang/srt/models/pixtral.py @@ -107,6 +107,8 @@ def __init__( dropout=0.0, use_context_forward=False, flatten_batch=False, + qkv_bias=False, + proj_bias=False, prefix=f"{prefix}.attention", ) From 
1a14eb0911620bd792d536cd3cb9d2a8db76f6c6 Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Tue, 2 Dec 2025 00:09:35 +0000 Subject: [PATCH 02/11] fix rope config Signed-off-by: Xinyuan Tong --- FlashMLA | 1 + benchmark/mmmu/bench_sglang.py | 16 ++++++------- benchmark/mmmu/eval_utils.py | 24 ++++++++++++------- .../sglang/srt/model_executor/model_runner.py | 4 ++-- python/sglang/srt/models/ministral3.py | 10 ++++---- 5 files changed, 32 insertions(+), 23 deletions(-) create mode 160000 FlashMLA diff --git a/FlashMLA b/FlashMLA new file mode 160000 index 000000000000..1408756a88e5 --- /dev/null +++ b/FlashMLA @@ -0,0 +1 @@ +Subproject commit 1408756a88e52a25196b759eaf8db89d2b51b5a1 diff --git a/benchmark/mmmu/bench_sglang.py b/benchmark/mmmu/bench_sglang.py index 9a0bf4529047..d9426ae5a3ac 100644 --- a/benchmark/mmmu/bench_sglang.py +++ b/benchmark/mmmu/bench_sglang.py @@ -83,9 +83,9 @@ async def process_sample( assert image is not None image_path = sample["image_path"] extra_body = None if lora_path is None else {"lora_path": lora_path} - response = await client.chat.completions.create( - model="default", - messages=[ + payload = { + "model": "default", + "messages": [ { "role": "user", "content": [ @@ -95,11 +95,11 @@ async def process_sample( ], } ], - temperature=0, - max_completion_tokens=sampling_params["max_new_tokens"], - max_tokens=sampling_params["max_new_tokens"], - extra_body=extra_body, - ) + "extra_body": extra_body, + } + if sampling_params: + payload.update(sampling_params) + response = await client.chat.completions.create(**payload) return sample, response.choices[0].message.content diff --git a/benchmark/mmmu/eval_utils.py b/benchmark/mmmu/eval_utils.py index 955a3bfa5e49..55efe619748a 100644 --- a/benchmark/mmmu/eval_utils.py +++ b/benchmark/mmmu/eval_utils.py @@ -36,7 +36,8 @@ class EvalArgs: profile: bool = False profile_number: int = 5 concurrency: int = 1 - max_new_tokens: int = 30 + max_new_tokens: Optional[int] = None + temperature: Optional[float] = None response_answer_regex: str = "(.*)" lora_path: Optional[str] = None @@ -101,6 +102,12 @@ def add_cli_args(parser: argparse.ArgumentParser): default=EvalArgs.max_new_tokens, help="Maximum number of new tokens to generate per sample.", ) + parser.add_argument( + "--temperature", + type=float, + default=EvalArgs.temperature, + help="Sampling temperature for generation.", + ) parser.add_argument( "--response-answer-regex", type=str, @@ -241,19 +248,20 @@ def process_sample(i, sample): def get_sampling_params(eval_args): - max_new_tokens = eval_args.max_new_tokens - temperature = 0.001 - extra_request_body = {} if eval_args.extra_request_body: extra_request_body = json.loads(eval_args.extra_request_body) - - return { - "temperature": temperature, - "max_new_tokens": max_new_tokens, + sampling_params = { **extra_request_body, } + if eval_args.max_new_tokens is not None and eval_args.max_new_tokens > 0: + sampling_params.update({"max_completion_tokens": eval_args.max_new_tokens}) + + if eval_args.temperature is not None: + sampling_params.update({"temperature": eval_args.temperature}) + + return sampling_params # ----------- Process Multi-choice ------------- def parse_multi_choice_response(response, all_choices, index2ans): diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 8e5bd0074848..83a503fa89fb 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -580,8 +580,8 @@ def 
check_quantized_moe_compatibility(self): quantization_config := getattr( self.model_config.hf_config, "quantization_config", None ) - ) is not None and "weight_block_size" in quantization_config: - weight_block_size_n = quantization_config["weight_block_size"][0] + ) is not None and (weight_block_size := quantization_config.get("weight_block_size", None)) is not None: + weight_block_size_n = weight_block_size[0] if self.tp_size % self.moe_ep_size != 0: raise ValueError( diff --git a/python/sglang/srt/models/ministral3.py b/python/sglang/srt/models/ministral3.py index db8b876fe3c8..ac1678dd9f3e 100644 --- a/python/sglang/srt/models/ministral3.py +++ b/python/sglang/srt/models/ministral3.py @@ -31,8 +31,8 @@ def __init__( num_heads: int, num_kv_heads: int, layer_id: int = 0, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_theta: float = 1000000.0, + rope_scaling: Optional[Dict[str, Any]] = {}, rope_is_neox_style: bool = True, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, @@ -107,9 +107,9 @@ def __init__(self, config, layer_id=0, quant_config=None, prefix=""): num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, layer_id=layer_id, - rope_theta=getattr(config, "rope_theta", 10000), - rope_scaling=getattr(config, "rope_scaling", None), - max_position_embeddings=getattr(config, "max_position_embeddings", 8192), + rope_theta=getattr(config, "rope_parameters", {}).get("rope_theta", 1000000.0), + rope_scaling=getattr(config, "rope_parameters", {}), # rope_scaling is rope_parameters in Ministral3Config + max_position_embeddings=getattr(config, "original_max_position_embeddings", 16384), quant_config=quant_config, prefix=add_prefix("self_attn", prefix), bias=getattr(config, "attention_bias", False) From 793ccb4d4b3e7110bb0ed79c599a2d5959f6754c Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Tue, 2 Dec 2025 19:47:44 +0000 Subject: [PATCH 03/11] remap fp8 weights Signed-off-by: Xinyuan Tong --- python/sglang/srt/model_loader/weight_utils.py | 8 ++++++++ python/sglang/srt/models/llama.py | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/python/sglang/srt/model_loader/weight_utils.py b/python/sglang/srt/model_loader/weight_utils.py index 4c236f7150d8..3683d8fa97c5 100644 --- a/python/sglang/srt/model_loader/weight_utils.py +++ b/python/sglang/srt/model_loader/weight_utils.py @@ -534,6 +534,14 @@ def filter_duplicate_safetensors_files( # torch state_dict to safetensors file holding that weight. 
index_file_name = os.path.join(hf_folder, index_file)
     if not os.path.isfile(index_file_name):
+        # NOTE: this is a trick for handling Mistral models:
+        # skip the unsupported consolidated.safetensors file
+        if len(hf_weights_files) == 2:
+            hf_weights_files.sort()
+            if hf_weights_files[0].endswith(
+                "consolidated.safetensors"
+            ) and hf_weights_files[1].endswith("model.safetensors"):
+                return [hf_weights_files[1]]
         return hf_weights_files
 
     # Iterate through the weight_map (weight_name: safetensors files)
diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py
index dbf6968eef04..4176af28d704 100644
--- a/python/sglang/srt/models/llama.py
+++ b/python/sglang/srt/models/llama.py
@@ -570,6 +570,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters())
 
         for name, loaded_weight in weights:
+            if name.endswith(".activation_scale"):
+                name = name.replace(".activation_scale", ".input_scale")
+            if name.endswith(".weight_scale_inv"):
+                name = name.replace(".weight_scale_inv", ".weight_scale")
+
             layer_id = get_layer_id(name)
             if (
                 layer_id is not None
From ad2c34eb22b06192c7dc74e1767e35365bff411a Mon Sep 17 00:00:00 2001
From: Xinyuan Tong
Date: Tue, 2 Dec 2025 19:48:16 +0000
Subject: [PATCH 04/11] lint

Signed-off-by: Xinyuan Tong
---
 benchmark/mmmu/eval_utils.py                     |  5 +++--
 python/sglang/srt/model_executor/model_runner.py |  4 +++-
 python/sglang/srt/models/ministral3.py           | 12 +++++++++---
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/benchmark/mmmu/eval_utils.py b/benchmark/mmmu/eval_utils.py
index 55efe619748a..b3edd69fc1ce 100644
--- a/benchmark/mmmu/eval_utils.py
+++ b/benchmark/mmmu/eval_utils.py
@@ -257,12 +257,13 @@ def get_sampling_params(eval_args):
 
     if eval_args.max_new_tokens is not None and eval_args.max_new_tokens > 0:
         sampling_params.update({"max_completion_tokens": eval_args.max_new_tokens})
-
+
     if eval_args.temperature is not None:
         sampling_params.update({"temperature": eval_args.temperature})
-
+
     return sampling_params
+

 # ----------- Process Multi-choice -------------
 def parse_multi_choice_response(response, all_choices, index2ans):
     """
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 83a503fa89fb..3162106229c7 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -580,7 +580,9 @@ def check_quantized_moe_compatibility(self):
             quantization_config := getattr(
                 self.model_config.hf_config, "quantization_config", None
             )
-        ) is not None and (weight_block_size := quantization_config.get("weight_block_size", None)) is not None:
+        ) is not None and (
+            weight_block_size := quantization_config.get("weight_block_size", None)
+        ) is not None:
             weight_block_size_n = weight_block_size[0]
 
             if self.tp_size % self.moe_ep_size != 0:
diff --git a/python/sglang/srt/models/ministral3.py b/python/sglang/srt/models/ministral3.py
index ac1678dd9f3e..460c7b30fb5e 100644
--- a/python/sglang/srt/models/ministral3.py
+++ b/python/sglang/srt/models/ministral3.py
@@ -107,9 +107,15 @@ def __init__(self, config, layer_id=0, quant_config=None, prefix=""):
             num_heads=config.num_attention_heads,
             num_kv_heads=config.num_key_value_heads,
             layer_id=layer_id,
-            rope_theta=getattr(config, "rope_parameters", {}).get("rope_theta", 1000000.0),
-            rope_scaling=getattr(config, "rope_parameters", {}), # rope_scaling is rope_parameters in Ministral3Config
-            max_position_embeddings=getattr(config, 
"original_max_position_embeddings", 16384), + rope_theta=getattr(config, "rope_parameters", {}).get( + "rope_theta", 1000000.0 + ), + rope_scaling=getattr( + config, "rope_parameters", {} + ), # rope_scaling is rope_parameters in Ministral3Config + max_position_embeddings=getattr( + config, "original_max_position_embeddings", 16384 + ), quant_config=quant_config, prefix=add_prefix("self_attn", prefix), bias=getattr(config, "attention_bias", False) From d4e5f2903a541c60570761ffdfa71e1b27adbd1a Mon Sep 17 00:00:00 2001 From: Yueming Yuan Date: Tue, 2 Dec 2025 16:01:36 -0800 Subject: [PATCH 05/11] fix ministral fp8 vision model --- python/sglang/srt/layers/quantization/fp8.py | 5 ++++- python/sglang/srt/layers/quantization/utils.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index 878f70619728..2b45819eb402 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -158,7 +158,10 @@ def from_config(cls, config: Dict[str, Any]) -> Fp8Config: quant_method = cls.get_from_keys(config, ["quant_method"]) is_checkpoint_fp8_serialized = "fp8" in quant_method activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) - ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) + ignored_layers = cls.get_from_keys_or(config, ["ignored_layers", "modules_to_not_convert"], None) + if ignored_layers: + # hacking ministral + ignored_layers = [layer.replace("model.", "") for layer in ignored_layers] weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None) return cls( is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, diff --git a/python/sglang/srt/layers/quantization/utils.py b/python/sglang/srt/layers/quantization/utils.py index d407b95f2776..a9483357dad7 100644 --- a/python/sglang/srt/layers/quantization/utils.py +++ b/python/sglang/srt/layers/quantization/utils.py @@ -64,7 +64,7 @@ def is_layer_skipped( is_skipped = None for shard_prefix in shard_prefixes: - is_shard_skipped = shard_prefix in ignored_layers + is_shard_skipped = any(ignored in shard_prefix for ignored in ignored_layers) if is_skipped is None: is_skipped = is_shard_skipped @@ -75,7 +75,7 @@ def is_layer_skipped( "to have the same precision." 
) else: - is_skipped = prefix in ignored_layers + is_skipped = any(ignored in prefix for ignored in ignored_layers) if "gate_up_proj" in prefix: prefix_gate = prefix.replace("gate_up_proj", "gate_proj") prefix_up = prefix.replace("gate_up_proj", "up_proj") From 14799d16555d8fd94a024fd5017c227f5c3b8357 Mon Sep 17 00:00:00 2001 From: Yueming Yuan Date: Tue, 2 Dec 2025 19:10:26 -0800 Subject: [PATCH 06/11] lint --- python/sglang/srt/layers/quantization/fp8.py | 4 +++- python/sglang/srt/layers/quantization/utils.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index 2b45819eb402..e8952fbe4ffe 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -158,7 +158,9 @@ def from_config(cls, config: Dict[str, Any]) -> Fp8Config: quant_method = cls.get_from_keys(config, ["quant_method"]) is_checkpoint_fp8_serialized = "fp8" in quant_method activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) - ignored_layers = cls.get_from_keys_or(config, ["ignored_layers", "modules_to_not_convert"], None) + ignored_layers = cls.get_from_keys_or( + config, ["ignored_layers", "modules_to_not_convert"], None + ) if ignored_layers: # hacking ministral ignored_layers = [layer.replace("model.", "") for layer in ignored_layers] diff --git a/python/sglang/srt/layers/quantization/utils.py b/python/sglang/srt/layers/quantization/utils.py index a9483357dad7..fc81f3140660 100644 --- a/python/sglang/srt/layers/quantization/utils.py +++ b/python/sglang/srt/layers/quantization/utils.py @@ -64,7 +64,9 @@ def is_layer_skipped( is_skipped = None for shard_prefix in shard_prefixes: - is_shard_skipped = any(ignored in shard_prefix for ignored in ignored_layers) + is_shard_skipped = any( + ignored in shard_prefix for ignored in ignored_layers + ) if is_skipped is None: is_skipped = is_shard_skipped From 40e3baf4dae4c3af2918d387972f85cc2be8e255 Mon Sep 17 00:00:00 2001 From: Yueming Yuan Date: Wed, 3 Dec 2025 02:06:08 -0800 Subject: [PATCH 07/11] add unit test --- test/srt/models/test_ministral3_models.py | 31 +++++++++++++++++++++++ test/srt/run_suite.py | 1 + 2 files changed, 32 insertions(+) create mode 100644 test/srt/models/test_ministral3_models.py diff --git a/test/srt/models/test_ministral3_models.py b/test/srt/models/test_ministral3_models.py new file mode 100644 index 000000000000..27ddd7d660e7 --- /dev/null +++ b/test/srt/models/test_ministral3_models.py @@ -0,0 +1,31 @@ +import unittest +from types import SimpleNamespace + +from sglang.test.gsm8k_mixin import GSM8KMixin +from sglang.test.mmmu_vlm_mixin import MMMUVLMMixin +from sglang.test.test_utils import CustomTestCase + +MODEL = "mistralai/Ministral-3-3B-Instruct-2512" + + +class TestMinistral3TextOnly(GSM8KMixin, CustomTestCase): + accuracy = 0.6 + model = MODEL + other_args = ["--trust-remote-code"] + + +class TestMinistral3MMMU(MMMUVLMMixin, CustomTestCase): + accuracy = 0.3 + model = MODEL + other_args = ["--trust-remote-code"] + mmmu_args = ["--limit=0.1"] + """`--limit=0.1`: 10 percent of each task - this is fine for testing since the nominal result isn't interesting - this run is just to prevent relative regressions.""" + + def test_vlm_mmmu_benchmark(self): + self._run_vlm_mmmu_test( + SimpleNamespace(model=self.model, mmmu_accuracy=self.accuracy), "./logs" + ) + +if __name__ == "__main__": + unittest.main() + diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 
49655d22045f..c74cf81cfad2 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -24,6 +24,7 @@ TestFile("models/test_embedding_models.py", 73), TestFile("models/test_encoder_embedding_models.py", 460), TestFile("models/test_generation_models.py", 103), + TestFile("models/test_ministral3_models.py", 350), TestFile("models/test_nvidia_nemotron_nano_v2.py", 160), TestFile("models/test_nvidia_nemotron_nano_v2_vl.py", 350), # GSM8k + MMMU TestFile("models/test_qwen_models.py", 150), From 93a13ed7333cf229116973af6aece52dd3038007 Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Thu, 4 Dec 2025 14:35:04 +0000 Subject: [PATCH 08/11] remove submodule FlashMLA --- FlashMLA | 1 - 1 file changed, 1 deletion(-) delete mode 160000 FlashMLA diff --git a/FlashMLA b/FlashMLA deleted file mode 160000 index 1408756a88e5..000000000000 --- a/FlashMLA +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1408756a88e52a25196b759eaf8db89d2b51b5a1 From fc69f4a78c11b205d570ecd149964e30903366db Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Thu, 4 Dec 2025 14:35:34 +0000 Subject: [PATCH 09/11] lint Signed-off-by: Xinyuan Tong --- test/srt/models/test_ministral3_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/srt/models/test_ministral3_models.py b/test/srt/models/test_ministral3_models.py index 27ddd7d660e7..6f6ca51b4e8d 100644 --- a/test/srt/models/test_ministral3_models.py +++ b/test/srt/models/test_ministral3_models.py @@ -15,7 +15,7 @@ class TestMinistral3TextOnly(GSM8KMixin, CustomTestCase): class TestMinistral3MMMU(MMMUVLMMixin, CustomTestCase): - accuracy = 0.3 + accuracy = 0.3 model = MODEL other_args = ["--trust-remote-code"] mmmu_args = ["--limit=0.1"] @@ -26,6 +26,6 @@ def test_vlm_mmmu_benchmark(self): SimpleNamespace(model=self.model, mmmu_accuracy=self.accuracy), "./logs" ) + if __name__ == "__main__": unittest.main() - From 1e8fd6435b61b0ef3c68952c10b23cec5763fd86 Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Thu, 4 Dec 2025 15:04:15 +0000 Subject: [PATCH 10/11] update Signed-off-by: Xinyuan Tong --- test/srt/run_suite.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index b37866c787e4..65e11705aa3c 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -214,6 +214,7 @@ TestFile("test_moe_eval_accuracy_large.py"), TestFile("test_vision_openai_server_common.py"), TestFile("test_profile_v2.py"), + TestFile("models/test_ministral3_models.py"), ], } From 44beef7de1fc9284005bbf5fdad88aa47d51c96e Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Thu, 4 Dec 2025 16:00:59 +0000 Subject: [PATCH 11/11] fix processor conflict Signed-off-by: Xinyuan Tong --- python/sglang/srt/multimodal/processors/pixtral.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/multimodal/processors/pixtral.py b/python/sglang/srt/multimodal/processors/pixtral.py index 6b6ab34ad1a0..b923ff342a19 100644 --- a/python/sglang/srt/multimodal/processors/pixtral.py +++ b/python/sglang/srt/multimodal/processors/pixtral.py @@ -56,7 +56,8 @@ def __init__(self, hf_config, server_args, _processor, *args, **kwargs): self.patch_size = self.vision_config.patch_size self._processor.patch_size = self.patch_size - self._processor.spatial_merge_size = self.vision_config.spatial_merge_size + if hasattr(self.vision_config, "spatial_merge_size"): + self._processor.spatial_merge_size = self.vision_config.spatial_merge_size self.mm_tokens = MultimodalSpecialTokens( image_token=_processor.image_token,