Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 18 additions & 18 deletions tests/full_tests/ci_gsm8k_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,26 +29,26 @@ fi
echo "Test with deepseek v2 lite passed"

# granite + inc
echo "Testing granite-8b + inc with vllm-hpu plugin v1"
echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model ibm-granite/granite-3.3-2b-instruct --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json \
HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model ibm-granite/granite-3.3-2b-instruct --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
if [ $? -ne 0 ]; then
echo "Error: Test failed for granite + inc" >&2
exit -1
fi
echo "Test with granite + inc passed"
#echo "Testing granite-8b + inc with vllm-hpu plugin v1"
#echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model ibm-granite/granite-3.3-2b-instruct --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
#QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json \
#HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model ibm-granite/granite-3.3-2b-instruct --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
#if [ $? -ne 0 ]; then
# echo "Error: Test failed for granite + inc" >&2
# exit -1
#fi
#echo "Test with granite + inc passed"

# deepseek v2 + inc
echo "Testing deepseek_v2 + inc with vllm-hpu plugin v1"
echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json \
HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
if [ $? -ne 0 ]; then
echo "Error: Test failed for deepseek_v2 + inc" >&2
exit -1
fi
echo "Test with deepseek_v2 + inc passed"
#echo "Testing deepseek_v2 + inc with vllm-hpu plugin v1"
#echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
#QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json \
#HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
#if [ $? -ne 0 ]; then
# echo "Error: Test failed for deepseek_v2 + inc" >&2
# exit -1
#fi
#echo "Test with deepseek_v2 + inc passed"

# gsm8k test
# used to check HPUattn + MLP
Expand Down
12 changes: 4 additions & 8 deletions vllm_gaudi/attention/backends/hpu_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import os
from dataclasses import dataclass
from typing import Any, Optional
from typing import Optional

import torch
import vllm_gaudi.extension.kernels as kernels
Expand Down Expand Up @@ -161,7 +161,6 @@ def __init__(
alibi_slopes: Optional[list[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
blocksparse_params: Optional[dict[str, Any]],
logits_soft_cap: Optional[float],
attn_type: str,
kv_sharing_target_layer_name: Optional[str] = None,
Expand All @@ -170,7 +169,7 @@ def __init__(
torch.nn.Module.__init__(self)
MLACommonImpl.__init__(self, num_heads, head_size, scale, num_kv_heads,
alibi_slopes, sliding_window, kv_cache_dtype,
blocksparse_params, logits_soft_cap, attn_type,
logits_soft_cap, attn_type,
kv_sharing_target_layer_name, **kwargs)
self.enable_fp8_attn = kv_cache_dtype == 'fp8_inc' and os.environ.get(
'QUANT_CONFIG', None) is None
Expand All @@ -191,13 +190,11 @@ def __init__(
assert self.prefill_impl != 'fsdpa_impl' or alibi_slopes is None, \
'Prefill with FusedSDPA not supported with alibi slopes!'

unsupported_features = [
alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
]
unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
if any(unsupported_features):
raise NotImplementedError(
"HPUMLAImpl does not support one of the following: "
"alibi_slopes, sliding_window, blocksparse_params, "
"alibi_slopes, sliding_window, "
"logits_soft_cap")

if attn_type != AttentionType.DECODER:
Expand Down Expand Up @@ -379,7 +376,6 @@ def __init__(
alibi_slopes: Optional[list[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
blocksparse_params: Optional[dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER,
kv_sharing_target_layer_name: Optional[str] = None,
Expand Down