diff --git a/vllm/model_executor/models/deepseek_v4.py b/vllm/model_executor/models/deepseek_v4.py
index 0b762d50fe72..01aa922f3c29 100644
--- a/vllm/model_executor/models/deepseek_v4.py
+++ b/vllm/model_executor/models/deepseek_v4.py
@@ -715,12 +715,15 @@ def __init__(
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.prefix = prefix
-        if vllm_config.parallel_config.enable_expert_parallel:
-            self.use_mega_moe = (
-                vllm_config.kernel_config.moe_backend == "deep_gemm_mega_moe"
+        self.use_mega_moe = (
+            vllm_config.kernel_config.moe_backend == "deep_gemm_mega_moe"
+        )
+        if self.use_mega_moe and not vllm_config.parallel_config.enable_expert_parallel:
+            raise NotImplementedError(
+                "DeepSeek V4 MegaMoE currently requires expert parallel. "
+                "Enable it with --enable-expert-parallel, or pick a different "
+                "moe backend."
             )
-        else:
-            self.use_mega_moe = False
 
         self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
         self.hidden_size = config.hidden_size
@@ -1223,12 +1226,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
-        if vllm_config.parallel_config.enable_expert_parallel:
-            self.use_mega_moe = (
-                vllm_config.kernel_config.moe_backend == "deep_gemm_mega_moe"
+        self.use_mega_moe = (
+            vllm_config.kernel_config.moe_backend == "deep_gemm_mega_moe"
+        )
+        if self.use_mega_moe and not vllm_config.parallel_config.enable_expert_parallel:
+            raise NotImplementedError(
+                "DeepSeek V4 MegaMoE currently requires expert parallel. "
+                "Enable it with --enable-expert-parallel, or pick a different "
+                "moe backend."
             )
-        else:
-            self.use_mega_moe = False
         self.vocab_size = config.vocab_size
         self.hc_eps = config.hc_eps
         self.hc_mult = config.hc_mult