diff --git a/python/sglang/srt/models/gemma4_causal.py b/python/sglang/srt/models/gemma4_causal.py index 190452fcd124..c406f12a2b6c 100644 --- a/python/sglang/srt/models/gemma4_causal.py +++ b/python/sglang/srt/models/gemma4_causal.py @@ -1147,7 +1147,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("experts.w13_weight", "experts.gate_up_proj", ("w1", "w3")), ("experts.w2_weight", "experts.down_proj", ("w2",)), ] - num_experts = self.config.num_experts + # Dense subclasses (e.g. the Gemma4 MTP assistant) reuse this. + num_experts = getattr(self.config, "num_experts", None) or 0 # Per-expert checkpoint format used by compressed-tensors / FP8 # (e.g. RedHatAI/*-FP8-Dynamic) and by ModelOpt NVFP4 @@ -1159,11 +1160,15 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # in a trailing dot, so the standard `name.replace(weight_name, # param_name)` collapses every suffix uniformly to the fused # FusedMoE params (experts.w13_*, experts.w2_*). - per_expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - ckpt_up_proj_name="up_proj", - num_experts=num_experts, + per_expert_params_mapping = ( + FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=num_experts, + ) + if num_experts + else [] ) k_eq_v_layers = self._get_k_eq_v_layers() diff --git a/python/sglang/srt/models/gemma4_mtp.py b/python/sglang/srt/models/gemma4_mtp.py index 1cb87b7c2e99..ade10ce5b990 100644 --- a/python/sglang/srt/models/gemma4_mtp.py +++ b/python/sglang/srt/models/gemma4_mtp.py @@ -21,6 +21,7 @@ from torch import nn from transformers import PretrainedConfig, PreTrainedModel +from sglang.srt.distributed import get_pp_group from sglang.srt.layers.linear import ReplicatedLinear from sglang.srt.layers.logits_processor import ( LogitsMetadata, @@ -72,6 +73,7 @@ def __init__( self.assistant_config = config self.config = text_config self.quant_config = quant_config + self.pp_group = get_pp_group() self.vocab_size = text_config.vocab_size self.hidden_size = text_config.hidden_size diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 1d1b8d29959d..2d203ad03cfd 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -2232,11 +2232,13 @@ def _handle_model_specific_adjustments(self): ) if is_sm100_supported() and self.moe_runner_backend == "auto": - - self.moe_runner_backend = "flashinfer_trtllm" - logger.info( - "Use flashinfer_trtllm as MoE runner backend on SM100 for Gemma-4 NVFP4" - ) + if self.get_model_config().quantization == "modelopt_fp4": + self.quantization = "modelopt_fp4" + self.moe_runner_backend = "flashinfer_trtllm" + logger.info( + "Use flashinfer_trtllm as MoE runner backend on " + "SM100 for Gemma-4 (modelopt_fp4)" + ) elif model_arch == "MossVLForConditionalGeneration": if self.is_attention_backend_not_set(): self.prefill_attention_backend = "flashinfer"