1 change: 1 addition & 0 deletions docs/dev_guide/plugin_system.md
@@ -79,6 +79,7 @@ def register_ops():
     import vllm_gaudi.v1.sample.hpu_rejection_sampler # noqa: F401
     import vllm_gaudi.distributed.kv_transfer.kv_connector.v1.hpu_nixl_connector # noqa: F401
     import vllm_gaudi.ops.hpu_fused_moe # noqa: F401
+    import vllm_gaudi.ops.hpu_grouped_topk_router # noqa: F401
     import vllm_gaudi.ops.hpu_layernorm # noqa: F401
     import vllm_gaudi.ops.hpu_lora # noqa: F401
     import vllm_gaudi.ops.hpu_rotary_embedding # noqa: F401
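These imports register ops purely by side effect: each `vllm_gaudi.ops.*` module, when imported, records its HPU implementation in vLLM's out-of-tree op registry. A minimal sketch of that pattern, assuming the dict-shaped `op_registry_oot` that `tests/unit_tests/ops/utils.py` manipulates below; the class and key names here are illustrative, not the real ones:

```python
# Hypothetical sketch of what a module like hpu_grouped_topk_router does on
# import: bind an HPU override to the name of the op class it replaces.
import vllm.model_executor.custom_op as custom_op


class HPUGroupedTopKRouter:
    """Stand-in for the real HPU grouped top-k router implementation."""


# The registry maps the upstream op's class name to the out-of-tree override,
# so vLLM picks up the HPU version when it instantiates that op.
custom_op.op_registry_oot["GroupedTopKRouter"] = HPUGroupedTopKRouter
```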
4 changes: 2 additions & 2 deletions tests/full_tests/ci_gsm8k_tests.sh
@@ -106,7 +106,7 @@ run_qwen3_moe_compressed_tensor_dynamic_scaling_test() {
 # QWEN3 FP8 + MOE compressed tensor + static scaling (weight per-tensor, activation per-tensor)
 run_qwen3_moe_compressed_tensor_static_per_tensor_scaling_test() {
     echo "➡️ Testing Intel/Qwen3-30B-A3B-FP8-Test-Only + moe + compressed-tensor + static scaling..."
-    HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true python -u "${VLLM_GAUDI_PREFIX}/tests/full_tests/generate.py" --model Intel/Qwen3-30B-A3B-FP8-Test-Only --trust-remote-code --no-enforce-eager --enable-expert-parallel
+    #HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true python -u "${VLLM_GAUDI_PREFIX}/tests/full_tests/generate.py" --model Intel/Qwen3-30B-A3B-FP8-Test-Only --trust-remote-code --no-enforce-eager --enable-expert-parallel
     echo "✅ Test with Intel/Qwen3-30B-A3B-FP8-Test-Only + moe + compressed-tensor + static scaling successful."
 }
 
@@ -120,7 +120,7 @@ run_qwen3_moe_compressed_tensor_static_scaling_test() {
 # RedHatAI/Meta-Llama-3-8B-Instruct-FP8 Per-tensor F8 static scales
 run_llama3_per_tensor_scaling_test() {
     echo "➡️ Testing RedHatAI/Meta-Llama-3-8B-Instruct-FP8 + per tensor scaling..."
-    HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/full_tests/generate.py" --model RedHatAI/Meta-Llama-3-8B-Instruct-FP8 --trust-remote-code
+    #HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/full_tests/generate.py" --model RedHatAI/Meta-Llama-3-8B-Instruct-FP8 --trust-remote-code
     echo "✅ Test with RedHatAI/Meta-Llama-3-8B-Instruct-FP8 + per tensor scaling successful."
 }
 
10 changes: 5 additions & 5 deletions tests/unit_tests/ops/utils.py
@@ -4,7 +4,7 @@
 import os
 import torch
 import contextlib
-from vllm.model_executor.custom_op import CustomOp
+import vllm.model_executor.custom_op as custom_op
 from vllm.model_executor.layers.linear import RowParallelLinear
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 
@@ -18,20 +18,20 @@ def temporary_op_registry_oot():
     of the op. (Because when running tests, if registration happened in one
     of them, then it is still valid in every other test).
     """
-    old_registry = CustomOp.op_registry_oot
-    CustomOp.op_registry_oot = {}
+    old_registry = custom_op.op_registry_oot
+    custom_op.op_registry_oot = {}
     try:
         yield
     finally:
-        CustomOp.op_registry_oot = old_registry
+        custom_op.op_registry_oot = old_registry


 def register_op(base_cls, oot_cls):
     """
     Manual registration of the oot op. It should be used
     within temporary_op_registry_oot context manager.
     """
-    CustomOp.op_registry_oot[base_cls.__name__] = oot_cls
+    custom_op.op_registry_oot[base_cls.__name__] = oot_cls


 def create_row_parallel_linear(input_size, output_size, quant_config=None):
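The switch from `CustomOp.op_registry_oot` to `custom_op.op_registry_oot` apparently tracks the registry moving from a `CustomOp` class attribute to a module-level global in vLLM; importing the module object (rather than the name) lets the tests rebind the registry and have every reader observe the change. A hedged usage sketch of the two helpers in a test; the override class is a placeholder, not a real vllm_gaudi op:

```python
# Sketch of how a unit test might use the helpers above.
from vllm.model_executor.layers.linear import RowParallelLinear

from tests.unit_tests.ops.utils import temporary_op_registry_oot, register_op


class FakeHPURowParallelLinear(RowParallelLinear):
    """Illustrative out-of-tree override, used only for this example."""


def test_registration_is_scoped_to_the_context():
    with temporary_op_registry_oot():
        register_op(RowParallelLinear, FakeHPURowParallelLinear)
        # ... build layers and assert the override is picked up here ...
    # On exit the previous registry is restored, so the registration
    # cannot leak into other tests.
```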
2 changes: 1 addition & 1 deletion tests/unit_tests/test_prefix_caching.py
@@ -5,7 +5,7 @@
 from vllm_gaudi.v1.worker.hpu_model_runner import HPUModelRunner
 
 from vllm.sampling_params import SamplingParams
-from vllm.attention.layer import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.platforms import current_platform
 from vllm.v1.core.sched.output import SchedulerOutput, NewRequestData, CachedRequestData
 from vllm.config import (VllmConfig, ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig, set_current_vllm_config)
2 changes: 1 addition & 1 deletion tests/unit_tests/worker/test_hpu_model_runner.py
@@ -7,7 +7,7 @@
 from habana_frameworks.torch.utils.internal import is_lazy
 from vllm.model_executor.model_loader import get_model
 
-from vllm.attention.layer import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig, VllmConfig, set_current_vllm_config)
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
1 change: 1 addition & 0 deletions vllm_gaudi/__init__.py
@@ -15,6 +15,7 @@ def register_ops():
     if os.getenv('VLLM_HPU_HETERO_KV_LAYOUT', 'false').lower() == 'true':
         import vllm_gaudi.distributed.kv_transfer.kv_connector.v1.hetero_hpu_nixl_connector # noqa: F401
     import vllm_gaudi.ops.hpu_fused_moe # noqa: F401
+    import vllm_gaudi.ops.hpu_grouped_topk_router # noqa: F401
     import vllm_gaudi.ops.hpu_layernorm # noqa: F401
     import vllm_gaudi.ops.hpu_lora # noqa: F401
     import vllm_gaudi.ops.hpu_rotary_embedding # noqa: F401
4 changes: 4 additions & 0 deletions vllm_gaudi/attention/backends/hpu_attn.py
@@ -246,6 +246,9 @@ def __init__(
         assert self.prefill_impl != 'fsdpa_impl' or alibi_slopes is None, \
             'Prefill with FusedSDPA not supported with alibi slopes!'
         self.is_aiter_triton_fp8_bmm_enabled = rocm_aiter_ops.is_fp8bmm_enabled()
+        # If kv_b_proj_weight is unquantized, quantize it to mxfp4 if supported
+        self.is_aiter_triton_fp4_bmm_enabled = (rocm_aiter_ops.is_fp4bmm_enabled()
+                                                and self.kv_b_proj.weight.dtype == torch.bfloat16)
 
         unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
         if any(unsupported_features):
@@ -1083,6 +1086,7 @@ def __init__(
         self.latent_cache_k = VLLMKVCache() if not self.enable_fp8_attn \
             else VLLMFP8KVCache()
         self.is_aiter_triton_fp8_bmm_enabled = False
+        self.is_aiter_triton_fp4_bmm_enabled = False
 
     def forward(
         self,
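The new flag is only set when `kv_b_proj` still holds bfloat16 weights, i.e. when the projection was left unquantized by the checkpoint; per the added comment, that is the case where on-the-fly mxfp4 quantization is worthwhile. A hedged sketch of how such a flag is typically consumed; the quantization helper is a made-up name standing in for the real kernel, not an actual API:

```python
import torch


def quantize_weight_to_mxfp4(w: torch.Tensor) -> torch.Tensor:
    """Placeholder for the real mxfp4 quantization kernel (illustrative)."""
    return w  # no-op stub so the sketch runs


def maybe_quantize_kv_b_proj(attn) -> None:
    # Mirrors the gating above: only quantize when fp4 bmm is available
    # and the weight is still bfloat16 (i.e. not already quantized).
    if attn.is_aiter_triton_fp4_bmm_enabled:
        assert attn.kv_b_proj.weight.dtype == torch.bfloat16
        attn.kv_b_proj.weight.data = quantize_weight_to_mxfp4(attn.kv_b_proj.weight.data)
```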
2 changes: 0 additions & 2 deletions vllm_gaudi/extension/ops.py
@@ -729,7 +729,6 @@ def apply_block_fp8_linear_hpu(
             input_2d,
             layer.weight,
             layer.weight_scale_inv,
-            layer.input_scale,
             bias,
         )
         return output.to(dtype=input.dtype).view(*input.shape[:-1], -1)
@@ -738,7 +737,6 @@
             layer.weight,
             block_size,
             layer.weight_scale_inv,
-            input_scale=layer.input_scale,
             bias=bias,
             original_M=layer.orig_M,
             original_N=layer.orig_N,
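Both call sites stop passing `layer.input_scale`, which suggests the HPU block-FP8 path now lets the kernel derive activation scales dynamically rather than using a static calibrated scale. For reference, a common dynamic per-token scheme looks like the sketch below; this illustrates the general technique, not what the HPU kernels literally do:

```python
import torch


def dynamic_per_token_fp8_scale(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize activations to fp8 with scales computed on the fly.

    448.0 is the largest normal value of float8_e4m3fn; each token row is
    scaled so its absolute maximum maps onto that range.
    """
    scale = x.abs().amax(dim=-1, keepdim=True).float().clamp(min=1e-12) / 448.0
    x_fp8 = (x / scale).to(torch.float8_e4m3fn)
    return x_fp8, scale
```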
10 changes: 1 addition & 9 deletions vllm_gaudi/models/qwen2_5_vl.py
@@ -28,7 +28,7 @@
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.quantization import QuantizationConfig
 
-from vllm.config import MultiModalConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 from vllm.model_executor.models.utils import (maybe_prefix, cast_overflow_tensors)
@@ -135,15 +135,13 @@ def __init__(
         num_heads: int,
         projection_size: int,
         quant_config: Optional[QuantizationConfig] = None,
-        multimodal_config: MultiModalConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__(
             embed_dim=embed_dim,
             num_heads=num_heads,
             projection_size=projection_size,
             quant_config=quant_config,
-            multimodal_config=multimodal_config,
             prefix=prefix,
         )
 
@@ -206,7 +204,6 @@ def __init__(
         act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
         norm_layer: Callable[[int], nn.Module] | None = None,
         quant_config: QuantizationConfig | None = None,
-        multimodal_config: MultiModalConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__(
@@ -216,15 +213,13 @@
             act_fn=act_fn,
             norm_layer=norm_layer,
             quant_config=quant_config,
-            multimodal_config=multimodal_config,
             prefix=prefix,
         )
         self.attn = HPUQwen2_5_VisionAttention(
             embed_dim=dim,
             num_heads=num_heads,
             projection_size=dim,
             quant_config=quant_config,
-            multimodal_config=multimodal_config,
             prefix=maybe_prefix(prefix, "attn."),
         )
 
@@ -268,14 +263,12 @@ def __init__(
         vision_config: Qwen2_5_VLVisionConfig,
         norm_eps: float = 1e-6,
         quant_config: QuantizationConfig | None = None,
-        multimodal_config: MultiModalConfig | None = None,
         prefix: str = "",
     ):
         super().__init__(
             vision_config=vision_config,
             norm_eps=norm_eps,
             quant_config=quant_config,
-            multimodal_config=multimodal_config,
             prefix=prefix,
         )
 
@@ -292,7 +285,6 @@ def __init__(
                 act_fn=get_act_and_mul_fn(vision_config.hidden_act),
                 norm_layer=norm_layer,
                 quant_config=quant_config,
-                multimodal_config=multimodal_config,
                 prefix=f"{prefix}.blocks.{layer_idx}",
             ) for layer_idx in range(depth)
         ])
11 changes: 0 additions & 11 deletions vllm_gaudi/models/qwen3_vl.py
@@ -22,7 +22,6 @@ def __init__(
         act_fn,
         norm_layer,
         quant_config=None,
-        multimodal_config=None,
         prefix: str = "",
     ):
         super().__init__(
@@ -32,7 +31,6 @@
             act_fn=act_fn,
             norm_layer=norm_layer,
             quant_config=quant_config,
-            multimodal_config=multimodal_config,
             prefix=prefix,
         )
 
@@ -41,7 +39,6 @@
             num_heads=num_heads,
             projection_size=dim,
             quant_config=quant_config,
-            multimodal_config=multimodal_config,
             prefix=f"{prefix}.attn",
         )
 
@@ -53,14 +50,12 @@
         vision_config,
         norm_eps: float = 1e-6,
         quant_config=None,
-        multimodal_config=None,
         prefix: str = "",
     ):
         super().__init__(
             vision_config=vision_config,
             norm_eps=norm_eps,
             quant_config=quant_config,
-            multimodal_config=multimodal_config,
             prefix=prefix,
         )
 
@@ -75,7 +70,6 @@
                 act_fn=get_act_fn(vision_config.hidden_act),
                 norm_layer=norm_layer,
                 quant_config=quant_config,
-                multimodal_config=multimodal_config,
                 prefix=f"{prefix}.blocks.{layer_idx}",
             ) for layer_idx in range(depth)
         ])
@@ -86,14 +80,9 @@ class HpuQwen3_VLForConditionalGeneration(Qwen3VLForConditionalGeneration):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
 
-        quant_config = getattr(self, "quant_config", None)
-        multimodal_config = getattr(vllm_config.model_config, "multimodal_config", None)
-
         if hasattr(self, "visual") and self.visual is not None:
             self.visual = HPUQwen3_VisionTransformer(
                 self.config.vision_config,
                 norm_eps=getattr(self.config, "rms_norm_eps", 1e-6),
-                quant_config=quant_config,
-                multimodal_config=multimodal_config,
                 prefix=maybe_prefix(prefix, "visual"),
             )
20 changes: 13 additions & 7 deletions vllm_gaudi/ops/hpu_compressed_tensors.py
@@ -6,7 +6,6 @@
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.linear import WEIGHT_LOADER_V2_SUPPORTED
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEConfig)
-from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 from compressed_tensors.quantization import (QuantizationArgs, QuantizationStrategy)
 
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import convert_to_channelwise, all_close_1d
@@ -247,6 +246,10 @@ def __init__(
 
         torch.hpu.synchronize()
 
+    @property
+    def is_monolithic(self) -> bool:
+        return True
+
     def create_weights(self, *args, **kwargs) -> None:
         if hpu_ops.is_hpu_gaudi2:
             kwargs['weight_loader'] = hpu_ops.gaudi_weight_wrapper(kwargs.get('weight_loader'))
@@ -302,10 +305,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             layer = hpu_ops.fp8_channel_moe_prepare_weights(layer)
             return
 
-    def apply(
+    def apply_monolithic(
         self,
         layer: FusedMoE,
-        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
         **kwargs,
@@ -322,6 +324,7 @@ def apply(
         topk_weights = topk_weights.to(x.dtype)
         topk_ids = topk_ids.view(*x.shape[:-1], -1)
         topk_weights = topk_weights.view(*x.shape[:-1], -1)
+
         output = layer.moe_op(
             x,
             topk_ids.to(torch.int64),
@@ -660,6 +663,10 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size:
         layer.a13_scale = None
         layer.a2_scale = None
 
+    @property
+    def is_monolithic(self) -> bool:
+        return True
+
     def gptq_hpu_moe_repack(self, b_q_weight: torch.Tensor) -> torch.Tensor:
         num_experts = b_q_weight.shape[0]
         outputs = []
@@ -709,14 +716,12 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
 
         htorch.core.mark_step()
 
-    def apply(
+    def apply_monolithic(
         self,
         layer: FusedMoE,
-        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-
         input_shape = x.shape
         x = x.view(-1, x.shape[-1])
 
@@ -730,6 +735,7 @@ def apply(
         topk_weights = topk_weights.to(x.dtype)
         topk_ids = topk_ids.view(*x.shape[:-1], -1)
         topk_weights = topk_weights.view(*x.shape[:-1], -1)
+
         output = layer.moe_op(
             x,
             topk_ids.to(torch.int64),
Expand Down Expand Up @@ -797,7 +803,7 @@ def get_quant_method(
layer: torch.nn.Module,
prefix: str,
) -> Optional["QuantizeMethodBase"]:
from vllm.attention.layer import MLAAttention
from vllm.model_executor.layers.attention import MLAAttention
if isinstance(layer, MLAAttention):
return HPUCompressedTensorsKVCacheMethodForMLA(self)
else:
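The pattern repeated throughout this file: each MoE quant method now advertises `is_monolithic = True` and renames `apply` to `apply_monolithic`, dropping the `FusedMoERouter` argument because the method derives top-k weights and ids from `router_logits` itself. A hedged sketch of the caller-side dispatch this interface implies; the class and attribute names are assumptions, not vLLM's actual `FusedMoE` code:

```python
import torch


class FusedMoELike:
    """Hypothetical stand-in for a FusedMoE layer, for illustration only."""

    def __init__(self, quant_method, router=None):
        self.quant_method = quant_method
        self.router = router

    def forward(self, x: torch.Tensor, router_logits: torch.Tensor) -> torch.Tensor:
        if getattr(self.quant_method, "is_monolithic", False):
            # Monolithic methods (like the HPU ones above) fuse routing and
            # expert computation, starting from the raw router logits.
            return self.quant_method.apply_monolithic(self, x, router_logits)
        # Otherwise routing happens first, via a separate router object.
        topk_weights, topk_ids = self.router.select_experts(router_logits)
        return self.quant_method.apply(self, x, topk_weights, topk_ids)
```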
8 changes: 5 additions & 3 deletions vllm_gaudi/ops/hpu_fp8.py
@@ -5,7 +5,6 @@
 from vllm_gaudi import envs
 from torch.nn.parameter import Parameter
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
-from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 
 from vllm.model_executor.layers.quantization import fp8
 from vllm.model_executor.layers.quantization.fp8 import (Fp8LinearMethod as OrigFp8LinearMethod, Fp8MoEMethod,
@@ -110,6 +109,10 @@ def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
 
         self.use_dispatch_fn = get_config().use_dispatch_fn
 
+    @property
+    def is_monolithic(self) -> bool:
+        return True
+
     def create_weights(self, *args, **kwargs) -> None:
         if hpu_ops.is_hpu_gaudi2:
             kwargs['weight_loader'] = hpu_ops.gaudi_weight_wrapper(kwargs.get('weight_loader'))
@@ -147,10 +150,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         else:
             layer = hpu_ops.fp8_channel_moe_prepare_weights(layer)
 
-    def apply(
+    def apply_monolithic(
         self,
         layer: FusedMoE,
-        router: FusedMoERouter,
         x: torch.Tensor,
         router_logits: torch.Tensor,
         **kwargs,