Fix MTP with Deepseek R1 Fp4 #7376
Changes from all commits: 65490a2, c653b1b, bf8af63, 241d55f, 9d6da0f, 357bbf8, 1baaaea, 6c2c8a7, 055ee31
```diff
@@ -2201,7 +2201,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=Fal
                 q_a_proj_weight = cached_a_proj[q_a_proj_name]
                 kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
                 cat_dim = 0
-                if (
+                if self.quant_config is not None and (
                     self.quant_config.get_name() == "awq"
                     or self.quant_config.get_name() == "moe_wna16"
                 ):
```
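For context, here is a minimal, standalone sketch of the failure mode this guard addresses. The helper name `uses_awq_style_cat` and the stub config class are illustrative, not part of the repository; only the guarded condition mirrors the diff. Without the `is not None` check, a model loaded with no quantization config attached would crash on `.get_name()`:

```python
# Sketch only: stand-ins for the real quant_config objects seen by the loader.
class _StubQuantConfig:
    def __init__(self, name: str) -> None:
        self._name = name

    def get_name(self) -> str:
        return self._name


def uses_awq_style_cat(quant_config) -> bool:
    # Mirrors the guarded condition from the diff: the `is not None` check
    # prevents an AttributeError when no quantization config is attached.
    return quant_config is not None and (
        quant_config.get_name() == "awq"
        or quant_config.get_name() == "moe_wna16"
    )


assert uses_awq_style_cat(None) is False               # unguarded version raised AttributeError here
assert uses_awq_style_cat(_StubQuantConfig("awq")) is True
assert uses_awq_style_cat(_StubQuantConfig("fp8")) is False
```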
```diff
@@ -2232,6 +2232,13 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=Fal
             for scale in ["k_scale", "v_scale"]:
                 if scale in name:
                     name = name.replace(f"{scale[0]}_proj", "attn_mqa")
                     break
+
+            if name not in params_dict:
+                # modelopt ckpt contains not needed weights for MTP module:
+                # model.decoder.self_attn.attn_mqa.v_scale and
+                # model.decoder.self_attn.attn_mqa.k_scale
+                logger.warning(f"{name} not found in params_dict.")
+                continue
+
             param = params_dict[name]
             weight_loader = getattr(
                 param, "weight_loader", default_weight_loader
```
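Below is a self-contained sketch of the skip-and-warn pattern this hunk introduces. `load_weights_tolerant` and the plain `copy_` fallback are simplified illustrations, not the repository's `default_weight_loader`; only the "warn and `continue` on unknown names" behavior reflects the diff.

```python
import logging
from typing import Dict, Iterable, Tuple

import torch

logger = logging.getLogger(__name__)


def load_weights_tolerant(
    params_dict: Dict[str, torch.nn.Parameter],
    weights: Iterable[Tuple[str, torch.Tensor]],
) -> None:
    """Copy checkpoint tensors into module params, skipping names the module lacks."""
    for name, loaded_weight in weights:
        if name not in params_dict:
            # e.g. a modelopt MTP checkpoint carries attn_mqa k_scale/v_scale
            # tensors the module never registers; warn and move on instead of
            # raising a KeyError and aborting the whole load.
            logger.warning("%s not found in params_dict.", name)
            continue
        param = params_dict[name]
        # Use the per-parameter loader if one is attached, otherwise a plain copy.
        weight_loader = getattr(param, "weight_loader", None)
        if weight_loader is not None:
            weight_loader(param, loaded_weight)
        else:
            param.data.copy_(loaded_weight)


# Tiny usage example: the extra "...k_scale" entry is skipped with a warning.
params = {"proj.weight": torch.nn.Parameter(torch.zeros(2, 2))}
ckpt = [
    ("proj.weight", torch.ones(2, 2)),
    ("model.decoder.self_attn.attn_mqa.k_scale", torch.tensor(1.0)),
]
load_weights_tolerant(params, ckpt)
```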
@pyc96, what was the justification for adding this? Does FlashInfer's Cutlass MoE implementation only support quantized models? At the bottom of the FlashInfer documentation it says the Cutlass MoE backend supports most data types: https://docs.flashinfer.ai/generated/flashinfer.fused_moe.cutlass_fused_moe.html#flashinfer.fused_moe.cutlass_fused_moe
Thanks!