# [FIX_FOR_VLLM_CUSTOM=ff1f83b056aedcf3e2d978d267011b2b79c08aca] Hourly fixes – batch no. 3 #1053
## Changes from all commits
a4009d6
7cc4c0c
7bac0a0
eac2ea8
787c236
e57533d
2f98435
72f5beb
ad55166
d594c1d
c31cae0
3f9505e
31e01d3
5876268
60e856e
b83f660
224b005
1e19403
000fbb6
55bb534
3fc47a3
b1849c0
f88594a
```diff
@@ -17,6 +17,7 @@
 import habana_frameworks.torch.utils.experimental as htexp
 import types
 from vllm.model_executor.layers.fused_moe import FusedMoeWeightScaleSupported
+from vllm.model_executor.layers.fused_moe.activation import MoEActivation
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization import get_quantization_config as vllm_get_quantization_config
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
```
```diff
@@ -33,6 +34,13 @@
 MAX_EXPERTS_PER_SLICE = int(os.environ.get("MAX_EXPERTS_PER_SLICE", -1))


+def _as_activation_str(activation):
+    """Normalize activation to string for HPU custom op."""
+    if isinstance(activation, MoEActivation):
+        return activation.value
+    return activation
+
+
 def get_inc_quant_method(layer):
     return layer
```
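
The helper can be exercised in isolation. Below is a standalone sketch: the `MoEActivation` stub stands in for the imported vllm enum (assumed here to be string-valued), so the snippet runs without vllm installed.

```python
from enum import Enum


class MoEActivation(str, Enum):
    # Stand-in for vllm.model_executor.layers.fused_moe.activation.MoEActivation;
    # assumed here to be a string-valued enum.
    SILU = "silu"
    GELU = "gelu"


def _as_activation_str(activation):
    """Normalize activation to string for HPU custom op."""
    if isinstance(activation, MoEActivation):
        return activation.value
    return activation


assert _as_activation_str(MoEActivation.SILU) == "silu"  # enum input -> plain string
assert _as_activation_str("silu") == "silu"              # string input passes through
```

This lets every `forward` below accept either the new enum or the legacy string without branching at each call site.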
```diff
@@ -626,6 +634,7 @@ def __init__(self,
     def forward(self, hidden_states, expert_routing_table, router_weights, permuted_weights=True, activation="silu"):
         tokens_num, _ = hidden_states.shape
+        activation = _as_activation_str(activation)
         kwargs = self._get_extra_kwargs(tokens_num)
         # pre-processing for custom op inputs
         experts_range = range(self.num_experts)
```
```diff
@@ -936,23 +945,20 @@ def fp8_perchannel_linear_postprocess_weights(layer):


 def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):
-    weight, orig_M, orig_N = pad_block_fp8_weight_naive(layer.weight.data, layer.weight_scale_inv.data,
-                                                        layer.quant_config.weight_block_size)
+    weight_scale_name = "weight_scale" if hasattr(layer, "weight_scale") else "weight_scale_inv"
+    weight_scale_inv = getattr(layer, weight_scale_name).data
+    weight_block_size = layer.weight_block_size if hasattr(
+        layer, 'weight_block_size') else layer.quant_config.weight_block_size
+    weight, orig_M, orig_N = pad_block_fp8_weight_naive(layer.weight.data, weight_scale_inv, weight_block_size)
     if force_channel_fp8:
         # convert to channel-wise fp8
         weight, weight_scale_inv = dynamic_quant(
             dequant_block_fp8_weight_naive(weight,
-                                           layer.weight_scale_inv.data,
-                                           layer.quant_config.weight_block_size,
+                                           weight_scale_inv.data,
+                                           weight_block_size,
                                            original_M=orig_M,
                                            original_N=orig_N,
                                            do_unpad=True))
         weight_scale_inv = weight_scale_inv.squeeze(-1)
         layer.weight.data.copy_(weight)
-        layer.weight_scale_inv = torch.nn.Parameter(weight_scale_inv, requires_grad=False)
+        replace_parameter(layer, weight_scale_name, torch.nn.Parameter(weight_scale_inv, requires_grad=False))
         htorch.core.mark_step()
         return layer
     else:
```
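
The change above replaces hard-coded `layer.weight_scale_inv` / `layer.quant_config.weight_block_size` lookups with `hasattr` fallbacks, so the function works whether the checkpoint loader registered the scale as `weight_scale` or `weight_scale_inv`. A minimal sketch of that resolution pattern, with a hypothetical `DummyLinear` standing in for a real quantized layer:

```python
import torch


class DummyLinear(torch.nn.Module):
    # Illustrative stand-in for a quantized linear layer; some loaders register
    # the scale as `weight_scale`, others as `weight_scale_inv`.
    def __init__(self):
        super().__init__()
        self.weight_scale_inv = torch.nn.Parameter(torch.ones(4, 4), requires_grad=False)


def resolve_scale(layer):
    # Mirrors the hasattr fallback in fp8_block_linear_postprocess_weights.
    name = "weight_scale" if hasattr(layer, "weight_scale") else "weight_scale_inv"
    return name, getattr(layer, name).data


layer = DummyLinear()
name, scale = resolve_scale(layer)
assert name == "weight_scale_inv"
# replace_parameter-style swap: re-register under the same name it was found.
setattr(layer, name, torch.nn.Parameter(scale * 2.0, requires_grad=False))
```

Writing the updated scale back under the resolved name (rather than always to `weight_scale_inv`) keeps the layer consistent with whichever convention produced it.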
```diff
@@ -969,35 +975,30 @@ def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):


 def fp8_block_moe_prepare_weights(layer, force_channel_fp8=False):
+    w13_weight_scale_name = "w13_weight_scale" if hasattr(layer, "w13_weight_scale") else "w13_weight_scale_inv"
+    w2_weight_scale_name = "w2_weight_scale" if hasattr(layer, "w2_weight_scale") else "w2_weight_scale_inv"
+    w13_weight_scale_param = getattr(layer, w13_weight_scale_name)
+    w2_weight_scale_param = getattr(layer, w2_weight_scale_name)
+    weight_block_size = layer.weight_block_size if hasattr(
+        layer, 'weight_block_size') else layer.quant_config.weight_block_size
+
     if force_channel_fp8:
         # convert to channel-wise fp8
         w13_weight, w13_weight_scale_inv = dynamic_quant(
-            dequant_block_fp8_weight_naive(layer.w13_weight.data, layer.w13_weight_scale_inv.data,
-                                           layer.quant_config.weight_block_size))
+            dequant_block_fp8_weight_naive(layer.w13_weight.data, w13_weight_scale_param.data, weight_block_size))
         w2_weight, w2_weight_scale_inv = dynamic_quant(
-            dequant_block_fp8_weight_naive(layer.w2_weight.data, layer.w2_weight_scale_inv.data,
-                                           layer.quant_config.weight_block_size))
+            dequant_block_fp8_weight_naive(layer.w2_weight.data, w2_weight_scale_param.data, weight_block_size))
         w13_weight_scale_inv, w2_weight_scale_inv \
             = w13_weight_scale_inv.squeeze(-1), w2_weight_scale_inv.squeeze(-1)
         layer.w13_weight.data.copy_(w13_weight)
         layer.w2_weight.data.copy_(w2_weight)
-        layer.w13_weight_scale_inv = torch.nn.Parameter(w13_weight_scale_inv, requires_grad=False)
-        layer.w2_weight_scale_inv = torch.nn.Parameter(w2_weight_scale_inv, requires_grad=False)
+        replace_parameter(layer, w13_weight_scale_name, torch.nn.Parameter(w13_weight_scale_inv, requires_grad=False))
+        replace_parameter(layer, w2_weight_scale_name, torch.nn.Parameter(w2_weight_scale_inv, requires_grad=False))
         return fp8_channel_moe_prepare_weights(layer)

     for index in range(layer.moe_op.num_experts):
         layer.moe_op.w13_list[index].set_weight(layer.w13_weight[index])
-        layer.moe_op.w13_list[index].set_scale_inv_fp8(layer.w13_weight_scale_inv[index])
-        layer.moe_op.w13_list[index].set_weight_block_size(layer.quant_config.weight_block_size)
+        layer.moe_op.w13_list[index].set_scale_inv_fp8(w13_weight_scale_param[index])
+        layer.moe_op.w13_list[index].set_weight_block_size(weight_block_size)

         layer.moe_op.w2_list[index].set_weight(layer.w2_weight[index])
-        layer.moe_op.w2_list[index].set_scale_inv_fp8(layer.w2_weight_scale_inv[index])
-        layer.moe_op.w2_list[index].set_weight_block_size(layer.quant_config.weight_block_size)
+        layer.moe_op.w2_list[index].set_scale_inv_fp8(w2_weight_scale_param[index])
+        layer.moe_op.w2_list[index].set_weight_block_size(weight_block_size)

     htorch.core.mark_step()
     return layer
```
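
For intuition on the `force_channel_fp8` branch: block-scaled weights are dequantized and re-quantized with one scale per output channel, which is what `dequant_block_fp8_weight_naive` plus `dynamic_quant` do in the diff above. A toy numeric sketch using plain float tensors in place of real fp8 (shapes, block size, and the 448.0 fp8-e4m3 max are illustrative):

```python
import torch

torch.manual_seed(0)
N, K, block = 4, 8, 4                      # out/in features and block size (illustrative)
w = torch.randn(N, K)
# One scale per block along K; real block quantization also tiles along N.
block_scales = w.abs().reshape(N, K // block, block).amax(-1) / 448.0  # fp8-e4m3 max
w_q = (w.reshape(N, K // block, block) / block_scales[..., None]).reshape(N, K)

# Dequantize block-wise, then re-quantize channel-wise (one scale per output row).
w_deq = (w_q.reshape(N, K // block, block) * block_scales[..., None]).reshape(N, K)
chan_scales = w_deq.abs().amax(dim=1, keepdim=True) / 448.0
w_chan = w_deq / chan_scales

assert torch.allclose(w_deq, w, atol=1e-5)  # naive round-trip is lossless here
assert torch.allclose(w_chan.abs().amax(dim=1), torch.full((N,), 448.0))
```

After this conversion the per-channel scales are 1-D (hence the `squeeze(-1)` in the diff), and the layer can be handed off to the channel-wise path via `fp8_channel_moe_prepare_weights`.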
```diff
@@ -1133,6 +1134,7 @@ def forward(
         activation="silu",
     ):
         tokens_num, _ = x.shape
+        activation = _as_activation_str(activation)
         kwargs = self._get_extra_kwargs(tokens_num)
         w13_list = []
         w2_list = []
```
```diff
@@ -1198,6 +1200,7 @@ def forward(
         activation="silu",
     ):
         tokens_num, _ = x.shape
+        activation = _as_activation_str(activation)
         kwargs = self._get_extra_kwargs(tokens_num)
         experts_range = range(self.num_experts)
         w13_list = [self.w13_list[i].weight.squeeze() for i in experts_range]
```
```diff
@@ -1404,6 +1407,7 @@ def forward(
         permuted_weights=True,
         activation="silu",
     ):
+        activation = _as_activation_str(activation)
         w13_list = []
         w2_list = []
         for j in range(self.num_experts):
```
Review comment: `create_request_compatible_with_signature` overwrites any caller-provided `eos_token_id` when the parameter exists in `Request`'s signature. To avoid surprising behavior in future tests, only set `eos_token_id` if it is supported and not already present in `request_kwargs`.
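
A minimal sketch of the suggested guard, assuming `create_request_compatible_with_signature` builds a `Request` from keyword arguments (the `Request` stub below is illustrative, not vllm's actual class):

```python
import inspect


class Request:
    # Illustrative stub; the real Request may or may not accept eos_token_id.
    def __init__(self, prompt, eos_token_id=None):
        self.prompt = prompt
        self.eos_token_id = eos_token_id


def create_request_compatible_with_signature(default_eos_token_id, **request_kwargs):
    params = inspect.signature(Request.__init__).parameters
    # Only inject the default when Request supports the parameter AND the caller
    # did not already provide one -- avoids silently overwriting caller intent.
    if "eos_token_id" in params and "eos_token_id" not in request_kwargs:
        request_kwargs["eos_token_id"] = default_eos_token_id
    return Request(**request_kwargs)


req = create_request_compatible_with_signature(2, prompt="hi", eos_token_id=7)
assert req.eos_token_id == 7  # caller-provided value is preserved
req2 = create_request_compatible_with_signature(2, prompt="hi")
assert req2.eos_token_id == 2  # default injected only when absent
```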