Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion vllm_ascend/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,14 +856,23 @@ def is_drafter_moe_model(vllm_config: VllmConfig):

def speculative_enable_dispatch_gmm_combine_decode(
vllm_config: VllmConfig) -> bool:
"""When draft contains MOE Arch and non-w8a8, disable dispatch_gmm_combine_decode."""
if vllm_config.speculative_config is None:
return True
speculative_method = getattr(vllm_config.speculative_config, "method",
None)
if speculative_method in [None, "ngram", "suffix"]:
return True
if speculative_method in ["eagle", "eagle3"]:
return False
if is_drafter_moe_model(vllm_config):
draft_model_config = vllm_config.speculative_config.draft_model_config
hf_text_config = draft_model_config.hf_text_config
quant_type = getattr(hf_text_config, "moe_quantize", None)
if quant_type is None:
quant_type = getattr(hf_text_config, "quantize", None)
return quant_type == "w8a8_dynamic"
else:
return True
if speculative_method == "mtp":
mtp_quant_type = getattr(vllm_config.model_config.hf_text_config,
"mtp_quantize", None)
Expand Down