diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 950d3acfb3d2..49fabfe5eef0 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -297,6 +297,7 @@ def __init__(
 
         # Model-specific adjustment
         self.model_specific_adjustment()
+        self.check_quantized_moe_compatibility()
 
         # Set the global server_args in the scheduler process
         set_global_server_args_for_scheduler(server_args)
@@ -553,34 +554,34 @@ def model_specific_adjustment(self):
             if not server_args.disable_chunked_prefix_cache:
                 log_info_on_rank0(logger, "Chunked prefix cache is turned on.")
 
-        if self.model_config.hf_config.model_type == "qwen3_vl_moe":
-            if (
-                quantization_config := getattr(
-                    self.model_config.hf_config, "quantization_config", None
-                )
-            ) is not None and "weight_block_size" in quantization_config:
-                weight_block_size_n = quantization_config["weight_block_size"][0]
+    def check_quantized_moe_compatibility(self):
+        if (
+            quantization_config := getattr(
+                self.model_config.hf_config, "quantization_config", None
+            )
+        ) is not None and "weight_block_size" in quantization_config:
+            weight_block_size_n = quantization_config["weight_block_size"][0]
 
-                if self.tp_size % self.moe_ep_size != 0:
-                    raise ValueError(
-                        f"tp_size {self.tp_size} must be divisible by moe_ep_size {self.moe_ep_size}"
-                    )
-                moe_tp_size = self.tp_size // self.moe_ep_size
+            if self.tp_size % self.moe_ep_size != 0:
+                raise ValueError(
+                    f"tp_size {self.tp_size} must be divisible by ep_size {self.moe_ep_size}"
+                )
+            moe_tp_size = self.tp_size // self.moe_ep_size
 
-                moe_intermediate_size = (
-                    self.model_config.hf_text_config.moe_intermediate_size
+            moe_intermediate_size = (
+                self.model_config.hf_text_config.moe_intermediate_size
+            )
+            if moe_intermediate_size % moe_tp_size != 0:
+                raise ValueError(
+                    f"moe_intermediate_size {moe_intermediate_size} must be divisible by moe_tp_size ({moe_tp_size}) which is tp_size ({self.tp_size}) divided by moe_ep_size ({self.moe_ep_size})."
                 )
-                if moe_intermediate_size % moe_tp_size != 0:
-                    raise ValueError(
-                        f"moe_intermediate_size {moe_intermediate_size} must be divisible by moe_tp_size ({moe_tp_size}) which is tp_size ({self.tp_size}) divided by moe_ep_size ({self.moe_ep_size})."
-                    )
 
-                if (moe_intermediate_size // moe_tp_size) % weight_block_size_n != 0:
-                    raise ValueError(
-                        f"For qwen3-vl-fp8 models, please make sure ({moe_intermediate_size=} / {moe_tp_size=}) % {weight_block_size_n=} == 0 "
-                        f"where moe_tp_size is equal to tp_size ({self.tp_size}) divided by moe_ep_size ({self.moe_ep_size}). "
-                        f"You can fix this by setting arguments `--tp-size` and `--ep-size` correctly."
-                    )
+            if (moe_intermediate_size // moe_tp_size) % weight_block_size_n != 0:
+                raise ValueError(
+                    f"For quantized MoE models, please make sure ({moe_intermediate_size=} / {moe_tp_size=}) % {weight_block_size_n=} == 0 "
+                    f"where moe_tp_size is equal to tp_size ({self.tp_size}) divided by ep_size ({self.moe_ep_size}). "
+                    f"You can fix this by setting arguments `--tp` and `--ep` correctly."
+                )
 
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
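
For reference, the divisibility rule the new `check_quantized_moe_compatibility` method enforces can be restated standalone. The sketch below is an illustration, not part of the patch: the helper name `check_compat` and the concrete sizes (`moe_intermediate_size=1536`, `weight_block_size_n=128`) are hypothetical stand-ins, not values from any particular checkpoint.

```python
# Illustrative restatement of the compatibility check above.
# All names and sizes here are example stand-ins.


def check_compat(
    tp_size: int,
    moe_ep_size: int,
    moe_intermediate_size: int,
    weight_block_size_n: int,
) -> None:
    # TP ranks are partitioned across EP groups, so tp_size must divide evenly.
    if tp_size % moe_ep_size != 0:
        raise ValueError(
            f"tp_size {tp_size} must be divisible by ep_size {moe_ep_size}"
        )
    # Effective tensor-parallel degree applied to each expert's weights.
    moe_tp_size = tp_size // moe_ep_size
    if moe_intermediate_size % moe_tp_size != 0:
        raise ValueError(
            f"moe_intermediate_size {moe_intermediate_size} must be divisible "
            f"by moe_tp_size {moe_tp_size}"
        )
    # Each TP shard of the intermediate dimension must hold whole
    # quantization blocks of size weight_block_size_n.
    if (moe_intermediate_size // moe_tp_size) % weight_block_size_n != 0:
        raise ValueError(
            f"({moe_intermediate_size=} // {moe_tp_size=}) is not a multiple "
            f"of {weight_block_size_n=}"
        )


check_compat(4, 1, 1536, 128)  # 1536 // 4 = 384; 384 % 128 == 0 -> OK

try:
    check_compat(8, 1, 1536, 128)  # 1536 // 8 = 192; 192 % 128 != 0
except ValueError as e:
    print(e)  # sharding would split a quantization block -> rejected
```

Failing fast at startup is preferable here: without the check, an incompatible `--tp`/`--ep` combination would only surface later as a shape mismatch deep inside weight loading or the quantized MoE kernels.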