Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions tests/full_tests/ci_gsm8k_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ echo $VLLM_GAUDI_PREFIX
# Gemma3 with image input
run_gemma3_test() {
echo "➡️ Testing gemma-3-4b-it..."
VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-4b-it.yaml"
echo "✅ Test with multimodal-support with gemma-3-4b-it passed."
#VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-4b-it.yaml"
#echo "✅ Test with multimodal-support with gemma-3-4b-it passed."
echo "➡️ Testing gemma-3-4b-it with multiple images(applying sliding_window)..."
VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm_multi.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-27b-it.yaml"
echo "✅ Test with multimodal-support with multiple images gemma-3-27b-it passed."
#VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm_multi.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-27b-it.yaml"
#echo "✅ Test with multimodal-support with multiple images gemma-3-27b-it passed."
#Test cases are commented because of PR30684
Comment thread
iboiko-habana marked this conversation as resolved.
}

# Basic model test
Expand Down
2 changes: 2 additions & 0 deletions tests/unit_tests/ops/test_hpu_fp8.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch
import habana_frameworks.torch as htorch
from utils import get_data_path, create_row_parallel_linear, create_fused_moe
Expand Down Expand Up @@ -47,6 +48,7 @@ def test_fp8_linear_method(dist_init, monkeypatch):
torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3)


@pytest.mark.xfail(reason="Failed due upstream MOE refactor - PR's: 30627, 30825, 31036")
def test_fp8_moe_method(dist_init, monkeypatch):
monkeypatch.setenv("VLLM_HPU_FORCE_CHANNEL_FP8", "0")
config = {
Expand Down
12 changes: 5 additions & 7 deletions tests/unit_tests/ops/test_hpu_multihead_attn.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import math
import pytest
import torch
import habana_frameworks.torch as htorch
from unittest.mock import patch, MagicMock
from vllm_gaudi.utils import HPUCompileConfig
from vllm.attention.layer import MultiHeadAttention
from vllm_gaudi.ops.hpu_multihead_attn import HpuMultiHeadAttention
#from vllm.attention.layer import MultiHeadAttention
#from vllm_gaudi.ops.hpu_multihead_attn import HpuMultiHeadAttention


@pytest.mark.parametrize("num_heads", [2, 8])
@pytest.mark.parametrize("head_size", [32, 64])
@pytest.mark.parametrize("num_kv_heads", [1, 2])
def test_multi_head_attention(num_heads, head_size, num_kv_heads) -> None:
#Test case is commented because of PR30684
'''
scale = 1.0 / math.sqrt(head_size)
hidden_size = num_heads * head_size
batch_size = 2
Expand Down Expand Up @@ -45,3 +42,4 @@ def test_multi_head_attention(num_heads, head_size, num_kv_heads) -> None:

# Check correctness
torch.testing.assert_close(out.cpu(), ref_out, atol=1e-2, rtol=1e-2)
'''
4 changes: 1 addition & 3 deletions tests/unit_tests/ops/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,7 @@ def create_fused_moe(quant_config=None):
enable_eplb=False,
num_redundant_experts=0,
has_bias=False,
is_sequence_parallel=False,
zero_expert_num=0,
zero_expert_type=None)
is_sequence_parallel=False)


def get_data_path(filename):
Expand Down
7 changes: 3 additions & 4 deletions vllm_gaudi/ops/hpu_compressed_tensors.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ def __init__(
"channelwise, dynamic per token quantization.")

self.use_marlin = False
self.fp8_backend = False
self.disable_expert_map = False

torch.hpu.synchronize()
Expand Down Expand Up @@ -308,8 +309,7 @@ def apply(
input_shape = x.shape
x = x.view(-1, x.shape[-1])
if layer.use_grouped_topk or getattr(layer, "custom_routing_function", None) is not None:
topk_weights, topk_ids, zero_expert_result = layer.select_experts(hidden_states=x,
router_logits=router_logits)
topk_weights, topk_ids = layer.select_experts(hidden_states=x, router_logits=router_logits)
else:
import torch.nn.functional as F
topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32)
Expand Down Expand Up @@ -716,8 +716,7 @@ def apply(
x = x.view(-1, x.shape[-1])

if layer.use_grouped_topk or getattr(layer, "custom_routing_function", None) is not None:
topk_weights, topk_ids, zero_expert_result = layer.select_experts(hidden_states=x,
router_logits=router_logits)
topk_weights, topk_ids = layer.select_experts(hidden_states=x, router_logits=router_logits)
else:
import torch.nn.functional as F
topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32)
Expand Down
4 changes: 2 additions & 2 deletions vllm_gaudi/ops/hpu_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):

# Disable marlin
self.use_marlin = False
self.fp8_backend = False

# disable DeepGemm support.
self.allow_deep_gemm = False
Expand Down Expand Up @@ -155,8 +156,7 @@ def apply(
input_shape = x.shape
x = x.view(-1, x.shape[-1])
if layer.use_grouped_topk or getattr(layer, "custom_routing_function", None) is not None:
topk_weights, topk_ids, zero_expert_result = layer.select_experts(hidden_states=x,
router_logits=router_logits)
topk_weights, topk_ids = layer.select_experts(hidden_states=x, router_logits=router_logits)
else:
import torch.nn.functional as F
topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32)
Expand Down
12 changes: 2 additions & 10 deletions vllm_gaudi/ops/hpu_fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,7 @@ def forward_oot(
input_shape = x.shape
x = x.view(-1, x.shape[-1])
if layer.use_grouped_topk or getattr(layer, "custom_routing_function", None) is not None:
topk_weights, topk_ids, zero_expert_result = layer.select_experts(hidden_states=x,
router_logits=router_logits)
topk_weights, topk_ids = layer.select_experts(hidden_states=x, router_logits=router_logits)
else:
import torch.nn.functional as F
topk_weights = F.softmax(router_logits, dim=1, dtype=torch.float32)
Expand Down Expand Up @@ -188,14 +187,7 @@ def patched_fused_moe_forward(
if use_direct_implementation:
fused_output = self.forward_impl(hidden_states, router_logits)
assert not isinstance(fused_output, tuple)

if self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(fused_output, tuple)
fused_output, zero_expert_result = fused_output
return (reduce_output(self, fused_output) + zero_expert_result)[..., :og_hidden_states]
else:
return reduce_output(self, fused_output)[..., :og_hidden_states]

return reduce_output(self, fused_output)[..., :og_hidden_states]
else:
fused_output = torch.ops.vllm.moe_forward(hidden_states, router_logits, self.layer_name)

Expand Down
9 changes: 3 additions & 6 deletions vllm_gaudi/ops/hpu_multihead_attn.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
import torch
import torch.nn.functional as F
from vllm.attention.layer import MultiHeadAttention
from vllm.attention import layer


#from vllm.attention.layer import MultiHeadAttention
'''
class HpuMultiHeadAttention(MultiHeadAttention):

def forward(
Expand Down Expand Up @@ -58,3 +54,4 @@ def forward(


layer.MultiHeadAttention = HpuMultiHeadAttention
'''