Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion vllm_gaudi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,5 @@ def register_models():
import vllm_gaudi.models.utils # noqa: F401
import vllm_gaudi.models.interfaces # noqa: F401
import vllm_gaudi.models.bert # noqa: F401
- import vllm_gaudi.models.roberta # noqa: F401
from .models import register_model
register_model()
3 changes: 3 additions & 0 deletions vllm_gaudi/attention/backends/hpu_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,9 @@ def forward(
dtype=self.alibi_slopes.dtype,
)

+ if key_cache is None:
+ return torch.zeros(*output_shape, dtype=query.dtype, device=query.device)
+
output = HPUPagedAttention.forward_decode(query=query,
block_mapping=block_mapping,
block_bias=attn_bias,
Expand Down
2 changes: 1 addition & 1 deletion vllm_gaudi/attention/oot_mla.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def forward(
attn_metadata = forward_context.attn_metadata
if isinstance(attn_metadata, dict):
attn_metadata = attn_metadata[self.layer_name]
- self_kv_cache = self.kv_cache[0]
+ self_kv_cache = self.kv_cache
#slot_mapping = forward_context.slot_mapping

#assert isinstance(slot_mapping, dict), (
Expand Down
5 changes: 2 additions & 3 deletions vllm_gaudi/models/qwen3_5.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,8 @@ def _extract_metadata(self, num_tokens):
is_prompt = bool(getattr(attn_metadata, "is_prompt", False))
state_indices = self._resolve_state_indices(attn_metadata)

- self_kv_cache = self.kv_cache[0]
- conv_state = self_kv_cache[0]
- ssm_state = self_kv_cache[1]
+ conv_state = self.kv_cache[0]
+ ssm_state = self.kv_cache[1]

query_start_loc = attn_metadata.query_start_loc_p
has_initial_state = getattr(attn_metadata, "has_initial_states_p", None)
Expand Down
28 changes: 0 additions & 28 deletions vllm_gaudi/models/roberta.py

This file was deleted.

2 changes: 1 addition & 1 deletion vllm_gaudi/ops/hpu_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def patched_attention_forward(
attn_metadata = forward_context.attn_metadata
if isinstance(attn_metadata, dict):
attn_metadata = attn_metadata[self.layer_name]
- self_kv_cache = self.kv_cache[0]
+ self_kv_cache = self.kv_cache
return self.impl.forward(self, query, key, value, self_kv_cache, attn_metadata)


Expand Down
6 changes: 2 additions & 4 deletions vllm_gaudi/ops/hpu_grouped_topk_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import vllm

from vllm import envs as envs
- from vllm.model_executor.layers.batch_invariant import (
- vllm_is_batch_invariant, )
from vllm.model_executor.utils import maybe_disable_graph_partition
from vllm.platforms import current_platform
from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import (GroupedTopk, fused_grouped_topk)
Expand Down Expand Up @@ -55,7 +53,7 @@ def grouped_topk(
raise ValueError(f"Unsupported scoring function: {scoring_func}")

# For batch invariance, use sorted=True to ensure deterministic expert selection
- use_sorted = vllm_is_batch_invariant()
+ use_sorted = envs.VLLM_BATCH_INVARIANT

num_token = scores.size(0)

Expand Down Expand Up @@ -130,7 +128,7 @@ def forward_oot(
raise ValueError(f"Unsupported scoring function: {self.scoring_func}")

# For batch invariance, use sorted=True to ensure deterministic expert selection
- use_sorted = vllm_is_batch_invariant()
+ use_sorted = envs.VLLM_BATCH_INVARIANT

num_token = scores.size(0)
if e_score_correction_bias is not None:
Expand Down
2 changes: 1 addition & 1 deletion vllm_gaudi/v1/kv_offload/worker/cpu_hpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
OffloadingHandler,
TransferSpec,
)
- from vllm.v1.kv_offload.cpu import CPUOffloadingSpec
+ from vllm.v1.kv_offload.cpu.spec import CPUOffloadingSpec
Comment thread
pawel-olejniczak marked this conversation as resolved.
from vllm.v1.kv_offload.abstract import LoadStoreSpec
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.cpu_gpu import (SingleDirectionOffloadingHandler, CpuGpuOffloadingHandlers)
Expand Down
Loading