diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py
index f0b54760a..2881e1686 100644
--- a/vllm_gaudi/attention/backends/hpu_attn.py
+++ b/vllm_gaudi/attention/backends/hpu_attn.py
@@ -246,6 +246,9 @@ def __init__(
         assert self.prefill_impl != 'fsdpa_impl' or alibi_slopes is None, \
             'Prefill with FusedSDPA not supported with alibi slopes!'
         self.is_aiter_triton_fp8_bmm_enabled = rocm_aiter_ops.is_fp8bmm_enabled()
+        # If kv_b_proj_weight is unquantized, quantize it to mxfp4 if supported
+        self.is_aiter_triton_fp4_bmm_enabled = (rocm_aiter_ops.is_fp4bmm_enabled()
+                                                and self.kv_b_proj.weight.dtype == torch.bfloat16)
 
         unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
         if any(unsupported_features):
diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py
index 96e38f3bc..f0c33b779 100644
--- a/vllm_gaudi/extension/ops.py
+++ b/vllm_gaudi/extension/ops.py
@@ -729,7 +729,6 @@ def apply_block_fp8_linear_hpu(
             input_2d,
             layer.weight,
             layer.weight_scale_inv,
-            layer.input_scale,
             bias,
         )
         return output.to(dtype=input.dtype).view(*input.shape[:-1], -1)
@@ -738,7 +737,6 @@ def apply_block_fp8_linear_hpu(
                                        layer.weight,
                                        block_size,
                                        layer.weight_scale_inv,
-                                       input_scale=layer.input_scale,
                                        bias=bias,
                                        original_M=layer.orig_M,
                                        original_N=layer.orig_N,
@@ -774,15 +772,10 @@ def apply_fp8_linear_hpu(
     input: torch.Tensor,
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
     trans_B: bool = True,
 ):
-    if input_scale is None:
-        x_fp8, x_scale = dynamic_quant(input)
-    else:
-        x_fp8 = torch.ops.hpu.cast_to_fp8_v2(input, 1.0 / input_scale, False, False, torch.float8_e4m3fn)[0]
-        x_scale = input_scale
+    x_fp8, x_scale = dynamic_quant(input)
     output = torch.ops.hpu.fp8_gemm_v2(A=x_fp8,
                                        trans_A=False,
                                        B=weight,
diff --git a/vllm_gaudi/ops/hpu_compressed_tensors.py b/vllm_gaudi/ops/hpu_compressed_tensors.py
index a682fc792..6228c0a2d 100644
--- a/vllm_gaudi/ops/hpu_compressed_tensors.py
+++ b/vllm_gaudi/ops/hpu_compressed_tensors.py
@@ -6,7 +6,7 @@
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.linear import WEIGHT_LOADER_V2_SUPPORTED
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEConfig)
-from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe import FusedMoERouter
 from compressed_tensors.quantization import (QuantizationArgs, QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import convert_to_channelwise, all_close_1d
@@ -190,11 +190,9 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
 
     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None):
         weight_scale = layer.weight_scale.transpose(0, 1) if layer.weight_scale.dim() > 1 else layer.weight_scale
-        input_scale = getattr(layer, 'input_scale', None)
         return hpu_ops.apply_fp8_linear_hpu(input=x,
                                             weight=layer.weight,
                                             weight_scale=weight_scale,
-                                            input_scale=input_scale,
                                             bias=bias,
                                             trans_B=False)
diff --git a/vllm_gaudi/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py
index a69b16c98..ba50806a3 100644
--- a/vllm_gaudi/ops/hpu_fp8.py
+++ b/vllm_gaudi/ops/hpu_fp8.py
@@ -5,7 +5,7 @@
 from vllm_gaudi import envs
 from torch.nn.parameter import Parameter
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
-from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe import FusedMoERouter
 from vllm.model_executor.layers.quantization import fp8
 from vllm.model_executor.layers.quantization.fp8 import (Fp8LinearMethod as OrigFp8LinearMethod, Fp8MoEMethod,
diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py
index df860578a..17f24e784 100644
--- a/vllm_gaudi/ops/hpu_fused_moe.py
+++ b/vllm_gaudi/ops/hpu_fused_moe.py
@@ -4,8 +4,8 @@
 import torch
 import vllm
 from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
-from vllm.model_executor.layers.fused_moe.fused_moe import GroupedTopk
-from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import GroupedTopk
+from vllm.model_executor.layers.fused_moe import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, UnquantizedFusedMoEMethod)
 from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOp)
 from vllm_gaudi.extension.runtime import get_config