Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 25 additions & 24 deletions python/sglang/srt/model_executor/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ def __init__(

# Model-specific adjustment
self.model_specific_adjustment()
self.check_quantized_moe_compatibility()

# Set the global server_args in the scheduler process
set_global_server_args_for_scheduler(server_args)
Expand Down Expand Up @@ -553,34 +554,34 @@ def model_specific_adjustment(self):
if not server_args.disable_chunked_prefix_cache:
log_info_on_rank0(logger, "Chunked prefix cache is turned on.")

def check_quantized_moe_compatibility(self):
    """Validate that block-quantized MoE weights can be evenly sharded.

    For models whose HF ``quantization_config`` declares a
    ``weight_block_size`` (block-wise quantization, e.g. FP8), each expert's
    intermediate dimension must split evenly across the MoE tensor-parallel
    ranks, and each resulting shard must be a whole number of quantization
    blocks. Otherwise weight loading would slice through a quantization
    block, so we fail fast with an actionable error.

    Reads (no mutation): ``self.model_config``, ``self.tp_size``,
    ``self.moe_ep_size``.

    Raises:
        ValueError: if ``tp_size`` is not divisible by ``moe_ep_size``, if
            ``moe_intermediate_size`` is not divisible by the derived
            ``moe_tp_size``, or if the per-rank shard is not a multiple of
            the quantization block size.
    """
    quantization_config = getattr(
        self.model_config.hf_config, "quantization_config", None
    )
    # Only block-quantized checkpoints carry "weight_block_size"; anything
    # else has no per-block shard-alignment constraint to enforce.
    if quantization_config is None or "weight_block_size" not in quantization_config:
        return

    # weight_block_size is [block_n, block_k]; the MoE intermediate (N)
    # dimension is the one sharded across moe_tp ranks.
    weight_block_size_n = quantization_config["weight_block_size"][0]

    if self.tp_size % self.moe_ep_size != 0:
        raise ValueError(
            f"tp_size {self.tp_size} must be divisible by ep_size {self.moe_ep_size}"
        )
    # Tensor parallelism left over after expert parallelism is applied.
    moe_tp_size = self.tp_size // self.moe_ep_size

    # NOTE(review): assumes hf_text_config defines moe_intermediate_size
    # whenever weight_block_size is present — confirm for dense quantized
    # models that reach this path.
    moe_intermediate_size = self.model_config.hf_text_config.moe_intermediate_size
    if moe_intermediate_size % moe_tp_size != 0:
        raise ValueError(
            f"moe_intermediate_size {moe_intermediate_size} must be divisible by moe_tp_size ({moe_tp_size}) which is tp_size ({self.tp_size}) divided by moe_ep_size ({self.moe_ep_size})."
        )

    # Each rank's shard of the intermediate dim must hold whole blocks.
    if (moe_intermediate_size // moe_tp_size) % weight_block_size_n != 0:
        raise ValueError(
            f"For quantized MoE models, please make sure ({moe_intermediate_size=} / {moe_tp_size=}) % {weight_block_size_n=} == 0 "
            f"where moe_tp_size is equal to tp_size ({self.tp_size}) divided by ep_size ({self.moe_ep_size}). "
            f"You can fix this by setting arguments `--tp` and `--ep` correctly."
        )

def init_torch_distributed(self):
logger.info("Init torch distributed begin.")
Expand Down
Loading