Changes from all commits (26 commits)
f8a78d8  [FIX_FOR_VLLM_LATEST] Fix for is_aiter_triton_fp4_bmm_enabled in mla_… (iboiko-habana, Jan 16, 2026)
dcdc2d1  Merge branch 'main' into pr32238 (iboiko-habana, Jan 19, 2026)
3801386  Fix for 30623 (iboiko-habana, Jan 19, 2026)
fdc2070  Merge branch 'main' into pr32238 (iboiko-habana, Jan 19, 2026)
17a2aed  Merge branch 'main' into pr32238 (iboiko-habana, Jan 19, 2026)
11c184b  Merge branch 'main' into pr32238 (PatrykWo, Jan 20, 2026)
4c8865c  Fix for RowParallelLinear' object has no attribute 'input_scale'. Did… (Jan 20, 2026)
d52402b  Update vllm_gaudi/ops/hpu_fused_moe.py (adobrzyn, Jan 20, 2026)
4f8b89f  Merge branch 'main' into adobrzyn/fix_for833 (adobrzyn, Jan 20, 2026)
deca5e1  Maybe like that?? (Jan 20, 2026)
aa7665a  Update ops.py (adobrzyn, Jan 20, 2026)
96b45b3  Fix? (Jan 20, 2026)
e06d038  Merge branch 'main' into adobrzyn/fix_for833 (iboiko-habana, Jan 21, 2026)
e2b1280  Fix for #32077, maybe_setup_kv_connector (iboiko-habana, Jan 21, 2026)
0b6533c  Merge branch 'main' into adobrzyn/fix_for833 (iboiko-habana, Jan 21, 2026)
73b91c0  Update hpu_model_runner.py (iboiko-habana, Jan 21, 2026)
e213f3e  Update hpu_model_runner.py - more fixes (iboiko-habana, Jan 21, 2026)
f74815d  Update hpu_model_runner.py (iboiko-habana, Jan 21, 2026)
62031fd  Update hpu_model_runner.py (iboiko-habana, Jan 21, 2026)
a6b4066  Merge branch 'main' into adobrzyn/fix_for833 (iboiko-habana, Jan 21, 2026)
5d324ac  Update hpu_model_runner.py (iboiko-habana, Jan 21, 2026)
8257e36  Update hpu_model_runner.py (iboiko-habana, Jan 21, 2026)
e163f56  Update hpu_model_runner.py (iboiko-habana, Jan 21, 2026)
6a9b0db  Update hpu_model_runner.py (iboiko-habana, Jan 21, 2026)
2881616  Merge branch 'main' into adobrzyn/fix_for833 (iboiko-habana, Jan 22, 2026)
6133033  Merge branch 'main' into adobrzyn/fix_for833 (iboiko-habana, Jan 23, 2026)
vllm_gaudi/attention/backends/hpu_attn.py (3 additions, 0 deletions)

@@ -246,6 +246,9 @@ def __init__(
         assert self.prefill_impl != 'fsdpa_impl' or alibi_slopes is None, \
             'Prefill with FusedSDPA not supported with alibi slopes!'
         self.is_aiter_triton_fp8_bmm_enabled = rocm_aiter_ops.is_fp8bmm_enabled()
+        # If kv_b_proj.weight is unquantized, quantize it to mxfp4 if supported
+        self.is_aiter_triton_fp4_bmm_enabled = (rocm_aiter_ops.is_fp4bmm_enabled()
+                                                and self.kv_b_proj.weight.dtype == torch.bfloat16)

         unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
         if any(unsupported_features):
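For orientation, a minimal sketch of how a capability flag like this is typically consumed downstream. The consumer method and the fp4 bmm kernel name below are hypothetical illustrations, not part of this diff; the PR itself only defines the flag.

    # Hypothetical consumer of the new flag (illustrative only).
    def kv_b_proj_bmm(self, x: torch.Tensor) -> torch.Tensor:
        if self.is_aiter_triton_fp4_bmm_enabled:
            # bf16 weight is quantized to mxfp4 on the fly and multiplied via the
            # aiter Triton batched-matmul kernel (kernel name assumed for illustration).
            return rocm_aiter_ops.fp4_bmm(x, self.kv_b_proj.weight)
        return torch.bmm(x, self.kv_b_proj.weight)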
vllm_gaudi/extension/ops.py (1 addition, 8 deletions)

@@ -729,7 +729,6 @@ def apply_block_fp8_linear_hpu(
         input_2d,
         layer.weight,
         layer.weight_scale_inv,
-        layer.input_scale,
         bias,
     )
     return output.to(dtype=input.dtype).view(*input.shape[:-1], -1)
@@ -738,7 +737,6 @@ def apply_block_fp8_linear_hpu(
         layer.weight,
         block_size,
         layer.weight_scale_inv,
-        input_scale=layer.input_scale,
         bias=bias,
         original_M=layer.orig_M,
         original_N=layer.orig_N,
@@ -774,15 +772,10 @@ def apply_fp8_linear_hpu(
     input: torch.Tensor,
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
     trans_B: bool = True,
 ):
-    if input_scale is None:
-        x_fp8, x_scale = dynamic_quant(input)
-    else:
-        x_fp8 = torch.ops.hpu.cast_to_fp8_v2(input, 1.0 / input_scale, False, False, torch.float8_e4m3fn)[0]
-        x_scale = input_scale
+    x_fp8, x_scale = dynamic_quant(input)
     output = torch.ops.hpu.fp8_gemm_v2(A=x_fp8,
                                        trans_A=False,
                                        B=weight,
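The net effect in apply_fp8_linear_hpu is that the activation scale is now always derived at runtime instead of being read from a calibrated layer.input_scale. A minimal sketch of per-tensor dynamic fp8 quantization, assuming dynamic_quant follows the usual amax-based scheme; the real HPU kernel may differ in detail:

    import torch

    FP8_MAX = torch.finfo(torch.float8_e4m3fn).max

    def dynamic_quant_sketch(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        # The scale comes from the current tensor's amax, so no stored input_scale
        # (and no calibration pass) is needed.
        x_scale = x.abs().amax().to(torch.float32).clamp(min=1e-12) / FP8_MAX
        x_fp8 = (x / x_scale).to(torch.float8_e4m3fn)
        return x_fp8, x_scale

Dropping the static path also removes the failure mode from commit 4c8865c, where layers without an input_scale attribute raised "RowParallelLinear' object has no attribute 'input_scale'".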
vllm_gaudi/ops/hpu_compressed_tensors.py (1 addition, 3 deletions)

@@ -6,7 +6,7 @@
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.linear import WEIGHT_LOADER_V2_SUPPORTED
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEConfig)
-from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe import FusedMoERouter
 from compressed_tensors.quantization import (QuantizationArgs, QuantizationStrategy)

 from vllm.model_executor.layers.quantization.utils.w8a8_utils import convert_to_channelwise, all_close_1d
@@ -190,11 +190,9 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,

     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None):
         weight_scale = layer.weight_scale.transpose(0, 1) if layer.weight_scale.dim() > 1 else layer.weight_scale
-        input_scale = getattr(layer, 'input_scale', None)
         return hpu_ops.apply_fp8_linear_hpu(input=x,
                                             weight=layer.weight,
                                             weight_scale=weight_scale,
-                                            input_scale=input_scale,
                                             bias=bias,
                                             trans_B=False)

vllm_gaudi/ops/hpu_fp8.py (1 addition, 1 deletion)

@@ -5,7 +5,7 @@
 from vllm_gaudi import envs
 from torch.nn.parameter import Parameter
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
-from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe import FusedMoERouter

 from vllm.model_executor.layers.quantization import fp8
 from vllm.model_executor.layers.quantization.fp8 import (Fp8LinearMethod as OrigFp8LinearMethod, Fp8MoEMethod,
vllm_gaudi/ops/hpu_fused_moe.py (2 additions, 2 deletions)

@@ -4,8 +4,8 @@
 import torch
 import vllm
 from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
-from vllm.model_executor.layers.fused_moe.fused_moe import GroupedTopk
-from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import GroupedTopk
+from vllm.model_executor.layers.fused_moe import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, UnquantizedFusedMoEMethod)
 from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOp)
 from vllm_gaudi.extension.runtime import get_config
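The same import move appears in hpu_compressed_tensors.py and hpu_fp8.py above: per this diff, FusedMoERouter is imported from the fused_moe package root and GroupedTopk from a dedicated router module. A quick smoke test, assuming a vllm build in which that relocation has landed:

    # Verifies the relocated import paths resolve (assumes a vllm version with the move).
    from vllm.model_executor.layers.fused_moe import FusedMoERouter
    from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import GroupedTopk

    print(FusedMoERouter, GroupedTopk)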