Skip to content
11 changes: 6 additions & 5 deletions tests/unit_tests/ops/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
import os
import torch
import contextlib
from vllm.model_executor.custom_op import CustomOp
import vllm.model_executor.custom_op as custom_op
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
#op_registry_oot = op_registry_oot


@contextlib.contextmanager
Expand All @@ -18,20 +19,20 @@ def temporary_op_registry_oot():
of the op. (When running tests, a registration performed in one test would
otherwise remain in effect for every other test.)
"""
old_registry = CustomOp.op_registry_oot
CustomOp.op_registry_oot = {}
old_registry = custom_op.op_registry_oot
custom_op.op_registry_oot = {}
try:
yield
finally:
CustomOp.op_registry_oot = old_registry
custom_op.op_registry_oot = old_registry


def register_op(base_cls, oot_cls):
"""
Manual registration of the oot op.

Stores ``oot_cls`` in ``op_registry_oot`` under the key
``base_cls.__name__``. It should be used within the
temporary_op_registry_oot context manager, which swaps in an empty
registry and restores the previous one on exit.

Args:
base_cls: Class whose ``__name__`` is used as the registry key.
oot_cls: Value stored in the registry for that key.
"""
# NOTE(review): the two assignments below write the same key to a
# class-level registry (CustomOp) and a module-level registry
# (custom_op). This text appears to come from a rendered diff, so they
# are presumably the pre-change and post-change forms of one line --
# confirm which is current.
CustomOp.op_registry_oot[base_cls.__name__] = oot_cls
custom_op.op_registry_oot[base_cls.__name__] = oot_cls


def create_row_parallel_linear(input_size, output_size, quant_config=None):
Expand Down
3 changes: 3 additions & 0 deletions vllm_gaudi/attention/backends/hpu_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,9 @@ def __init__(
assert self.prefill_impl != 'fsdpa_impl' or alibi_slopes is None, \
'Prefill with FusedSDPA not supported with alibi slopes!'
self.is_aiter_triton_fp8_bmm_enabled = rocm_aiter_ops.is_fp8bmm_enabled()
# If the kv_b_proj weight is unquantized (bfloat16), quantize it to mxfp4 when supported
self.is_aiter_triton_fp4_bmm_enabled = (rocm_aiter_ops.is_fp4bmm_enabled()
and self.kv_b_proj.weight.dtype == torch.bfloat16)

unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
if any(unsupported_features):
Expand Down
9 changes: 1 addition & 8 deletions vllm_gaudi/extension/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,7 +729,6 @@ def apply_block_fp8_linear_hpu(
input_2d,
layer.weight,
layer.weight_scale_inv,
layer.input_scale,
bias,
)
return output.to(dtype=input.dtype).view(*input.shape[:-1], -1)
Expand All @@ -738,7 +737,6 @@ def apply_block_fp8_linear_hpu(
layer.weight,
block_size,
layer.weight_scale_inv,
input_scale=layer.input_scale,
bias=bias,
original_M=layer.orig_M,
original_N=layer.orig_N,
Expand Down Expand Up @@ -774,15 +772,10 @@ def apply_fp8_linear_hpu(
input: torch.Tensor,
weight: torch.Tensor,
weight_scale: torch.Tensor,
input_scale: Optional[torch.Tensor] = None,
bias: Optional[torch.Tensor] = None,
trans_B: bool = True,
):
if input_scale is None:
x_fp8, x_scale = dynamic_quant(input)
else:
x_fp8 = torch.ops.hpu.cast_to_fp8_v2(input, 1.0 / input_scale, False, False, torch.float8_e4m3fn)[0]
x_scale = input_scale
x_fp8, x_scale = dynamic_quant(input)
output = torch.ops.hpu.fp8_gemm_v2(A=x_fp8,
trans_A=False,
B=weight,
Expand Down
8 changes: 3 additions & 5 deletions vllm_gaudi/ops/hpu_compressed_tensors.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.linear import WEIGHT_LOADER_V2_SUPPORTED
from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEConfig)
from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
from vllm.model_executor.layers.fused_moe import FusedMoERouter
from compressed_tensors.quantization import (QuantizationArgs, QuantizationStrategy)

from vllm.model_executor.layers.quantization.utils.w8a8_utils import convert_to_channelwise, all_close_1d
Expand Down Expand Up @@ -190,11 +190,9 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,

def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None):
"""
Apply the layer's linear transform by delegating to
``hpu_ops.apply_fp8_linear_hpu``.

Args:
layer: Module providing ``weight`` and ``weight_scale`` (and,
optionally, ``input_scale``).
x: Input tensor forwarded as ``input``.
bias: Optional bias forwarded unchanged.

Returns:
Whatever ``hpu_ops.apply_fp8_linear_hpu`` returns.
"""
# A weight_scale with more than one dimension is transposed (dims 0 and 1)
# before being passed on; a 0-D/1-D scale is used as-is.
weight_scale = layer.weight_scale.transpose(0, 1) if layer.weight_scale.dim() > 1 else layer.weight_scale
# input_scale is optional: None when the layer has no such attribute.
# NOTE(review): this text appears to come from a rendered diff; confirm
# whether the input_scale lookup and argument still exist in the current file.
input_scale = getattr(layer, 'input_scale', None)
return hpu_ops.apply_fp8_linear_hpu(input=x,
weight=layer.weight,
weight_scale=weight_scale,
input_scale=input_scale,
bias=bias,
trans_B=False)

Expand Down Expand Up @@ -313,7 +311,7 @@ def apply(
input_shape = x.shape
x = x.view(-1, x.shape[-1])
if layer.use_grouped_topk or getattr(layer, "custom_routing_function", None) is not None:
topk_weights, topk_ids = layer.router.select_experts(hidden_states=x, router_logits=router_logits)
topk_weights, topk_ids = router.select_experts(hidden_states=x, router_logits=router_logits)
else:
import torch.nn.functional as F
topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32)
Expand Down Expand Up @@ -721,7 +719,7 @@ def apply(
x = x.view(-1, x.shape[-1])

if layer.use_grouped_topk or getattr(layer, "custom_routing_function", None) is not None:
topk_weights, topk_ids = layer.router.select_experts(hidden_states=x, router_logits=router_logits)
topk_weights, topk_ids = router.select_experts(hidden_states=x, router_logits=router_logits)
else:
import torch.nn.functional as F
topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32)
Expand Down
4 changes: 2 additions & 2 deletions vllm_gaudi/ops/hpu_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from vllm_gaudi import envs
from torch.nn.parameter import Parameter
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
from vllm.model_executor.layers.fused_moe import FusedMoERouter

from vllm.model_executor.layers.quantization import fp8
from vllm.model_executor.layers.quantization.fp8 import (Fp8LinearMethod as OrigFp8LinearMethod, Fp8MoEMethod,
Expand Down Expand Up @@ -158,7 +158,7 @@ def apply(
input_shape = x.shape
x = x.view(-1, x.shape[-1])
if layer.use_grouped_topk or getattr(layer, "custom_routing_function", None) is not None:
topk_weights, topk_ids = layer.router.select_experts(hidden_states=x, router_logits=router_logits)
topk_weights, topk_ids = router.select_experts(hidden_states=x, router_logits=router_logits)
else:
import torch.nn.functional as F
topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32)
Expand Down
6 changes: 3 additions & 3 deletions vllm_gaudi/ops/hpu_fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import torch
import vllm
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm.model_executor.layers.fused_moe.fused_moe import GroupedTopk
from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import GroupedTopk
from vllm.model_executor.layers.fused_moe import (FusedMoERouter)
from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, UnquantizedFusedMoEMethod)
from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOp)
from vllm_gaudi.extension.runtime import get_config
Expand Down Expand Up @@ -127,7 +127,7 @@ def forward_oot(
input_shape = x.shape
x = x.view(-1, x.shape[-1])
if layer.use_grouped_topk or getattr(layer, "custom_routing_function", None) is not None:
topk_weights, topk_ids = layer.router.select_experts(hidden_states=x, router_logits=router_logits)
topk_weights, topk_ids = router.select_experts(hidden_states=x, router_logits=router_logits)
else:
import torch.nn.functional as F
topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32)
Expand Down
Loading