diff --git a/tests/unit_tests/ops/utils.py b/tests/unit_tests/ops/utils.py index bf31336da7..3dabb9d501 100644 --- a/tests/unit_tests/ops/utils.py +++ b/tests/unit_tests/ops/utils.py @@ -4,9 +4,10 @@ import os import torch import contextlib -from vllm.model_executor.custom_op import CustomOp +import vllm.model_executor.custom_op as custom_op from vllm.model_executor.layers.linear import RowParallelLinear from vllm.model_executor.layers.fused_moe.layer import FusedMoE +#op_registry_oot = op_registry_oot @contextlib.contextmanager @@ -18,12 +19,12 @@ def temporary_op_registry_oot(): of the op. (Because when running tests, if registration happened in one of them, then it is still valid in every other test). """ - old_registry = CustomOp.op_registry_oot - CustomOp.op_registry_oot = {} + old_registry = custom_op.op_registry_oot + custom_op.op_registry_oot = {} try: yield finally: - CustomOp.op_registry_oot = old_registry + custom_op.op_registry_oot = old_registry def register_op(base_cls, oot_cls): @@ -31,7 +32,7 @@ def register_op(base_cls, oot_cls): Manual registration of the oot op. It should be used within temporary_op_registry_oot context manager. """ - CustomOp.op_registry_oot[base_cls.__name__] = oot_cls + custom_op.op_registry_oot[base_cls.__name__] = oot_cls def create_row_parallel_linear(input_size, output_size, quant_config=None): diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py index f0b54760a6..2881e16866 100644 --- a/vllm_gaudi/attention/backends/hpu_attn.py +++ b/vllm_gaudi/attention/backends/hpu_attn.py @@ -246,6 +246,9 @@ def __init__( assert self.prefill_impl != 'fsdpa_impl' or alibi_slopes is None, \ 'Prefill with FusedSDPA not supported with alibi slopes!' self.is_aiter_triton_fp8_bmm_enabled = rocm_aiter_ops.is_fp8bmm_enabled() + # If kv_b_proj_weight is unquantized, quantize it to mxfp4 if supported + self.is_aiter_triton_fp4_bmm_enabled = (rocm_aiter_ops.is_fp4bmm_enabled() + and self.kv_b_proj.weight.dtype == torch.bfloat16) unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py index 96e38f3bc8..f0c33b7791 100644 --- a/vllm_gaudi/extension/ops.py +++ b/vllm_gaudi/extension/ops.py @@ -729,7 +729,6 @@ def apply_block_fp8_linear_hpu( input_2d, layer.weight, layer.weight_scale_inv, - layer.input_scale, bias, ) return output.to(dtype=input.dtype).view(*input.shape[:-1], -1) @@ -738,7 +737,6 @@ def apply_block_fp8_linear_hpu( layer.weight, block_size, layer.weight_scale_inv, - input_scale=layer.input_scale, bias=bias, original_M=layer.orig_M, original_N=layer.orig_N, @@ -774,15 +772,10 @@ def apply_fp8_linear_hpu( input: torch.Tensor, weight: torch.Tensor, weight_scale: torch.Tensor, - input_scale: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, trans_B: bool = True, ): - if input_scale is None: - x_fp8, x_scale = dynamic_quant(input) - else: - x_fp8 = torch.ops.hpu.cast_to_fp8_v2(input, 1.0 / input_scale, False, False, torch.float8_e4m3fn)[0] - x_scale = input_scale + x_fp8, x_scale = dynamic_quant(input) output = torch.ops.hpu.fp8_gemm_v2(A=x_fp8, trans_A=False, B=weight, diff --git a/vllm_gaudi/ops/hpu_compressed_tensors.py b/vllm_gaudi/ops/hpu_compressed_tensors.py index a682fc7921..7755008e3b 100644 --- a/vllm_gaudi/ops/hpu_compressed_tensors.py +++ b/vllm_gaudi/ops/hpu_compressed_tensors.py @@ -6,7 +6,7 @@ from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.linear import WEIGHT_LOADER_V2_SUPPORTED from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEConfig) -from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter +from vllm.model_executor.layers.fused_moe import FusedMoERouter from compressed_tensors.quantization import (QuantizationArgs, QuantizationStrategy) from vllm.model_executor.layers.quantization.utils.w8a8_utils import convert_to_channelwise, all_close_1d @@ -190,11 +190,9 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None): weight_scale = layer.weight_scale.transpose(0, 1) if layer.weight_scale.dim() > 1 else layer.weight_scale - input_scale = getattr(layer, 'input_scale', None) return hpu_ops.apply_fp8_linear_hpu(input=x, weight=layer.weight, weight_scale=weight_scale, - input_scale=input_scale, bias=bias, trans_B=False) @@ -313,7 +311,7 @@ def apply( input_shape = x.shape x = x.view(-1, x.shape[-1]) if layer.use_grouped_topk or getattr(layer, "custom_routing_function", None) is not None: - topk_weights, topk_ids = layer.router.select_experts(hidden_states=x, router_logits=router_logits) + topk_weights, topk_ids = router.select_experts(hidden_states=x, router_logits=router_logits) else: import torch.nn.functional as F topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32) @@ -721,7 +719,7 @@ def apply( x = x.view(-1, x.shape[-1]) if layer.use_grouped_topk or getattr(layer, "custom_routing_function", None) is not None: - topk_weights, topk_ids = layer.router.select_experts(hidden_states=x, router_logits=router_logits) + topk_weights, topk_ids = router.select_experts(hidden_states=x, router_logits=router_logits) else: import torch.nn.functional as F topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32) diff --git a/vllm_gaudi/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py index a69b16c98d..8513b6f366 100644 --- a/vllm_gaudi/ops/hpu_fp8.py +++ b/vllm_gaudi/ops/hpu_fp8.py @@ -5,7 +5,7 @@ from vllm_gaudi import envs from torch.nn.parameter import Parameter from vllm.model_executor.layers.fused_moe.layer import FusedMoE -from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter +from vllm.model_executor.layers.fused_moe import FusedMoERouter from vllm.model_executor.layers.quantization import fp8 from vllm.model_executor.layers.quantization.fp8 import (Fp8LinearMethod as OrigFp8LinearMethod, Fp8MoEMethod, @@ -158,7 +158,7 @@ def apply( input_shape = x.shape x = x.view(-1, x.shape[-1]) if layer.use_grouped_topk or getattr(layer, "custom_routing_function", None) is not None: - topk_weights, topk_ids = layer.router.select_experts(hidden_states=x, router_logits=router_logits) + topk_weights, topk_ids = router.select_experts(hidden_states=x, router_logits=router_logits) else: import torch.nn.functional as F topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32) diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py index 0fa5db5db5..6c358b5e5a 100644 --- a/vllm_gaudi/ops/hpu_fused_moe.py +++ b/vllm_gaudi/ops/hpu_fused_moe.py @@ -4,8 +4,8 @@ import torch import vllm from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant -from vllm.model_executor.layers.fused_moe.fused_moe import GroupedTopk -from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter +from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import GroupedTopk +from vllm.model_executor.layers.fused_moe import (FusedMoERouter) from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, UnquantizedFusedMoEMethod) from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOp) from vllm_gaudi.extension.runtime import get_config @@ -127,7 +127,7 @@ def forward_oot( input_shape = x.shape x = x.view(-1, x.shape[-1]) if layer.use_grouped_topk or getattr(layer, "custom_routing_function", None) is not None: - topk_weights, topk_ids = layer.router.select_experts(hidden_states=x, router_logits=router_logits) + topk_weights, topk_ids = router.select_experts(hidden_states=x, router_logits=router_logits) else: import torch.nn.functional as F topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32)