diff --git a/vllm_gaudi/__init__.py b/vllm_gaudi/__init__.py
index 7152da64c..dd376f218 100755
--- a/vllm_gaudi/__init__.py
+++ b/vllm_gaudi/__init__.py
@@ -112,6 +112,5 @@ def register_models():
     import vllm_gaudi.models.utils  # noqa: F401
     import vllm_gaudi.models.interfaces  # noqa: F401
     import vllm_gaudi.models.bert  # noqa: F401
-    import vllm_gaudi.models.roberta  # noqa: F401
     from .models import register_model
     register_model()
diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py
index f61a16357..043a1f06c 100644
--- a/vllm_gaudi/attention/backends/hpu_attn.py
+++ b/vllm_gaudi/attention/backends/hpu_attn.py
@@ -649,6 +649,9 @@ def forward(
                 dtype=self.alibi_slopes.dtype,
             )
 
+        if key_cache is None:
+            return torch.zeros(*output_shape, dtype=query.dtype, device=query.device)
+
         output = HPUPagedAttention.forward_decode(query=query,
                                                   block_mapping=block_mapping,
                                                   block_bias=attn_bias,
diff --git a/vllm_gaudi/attention/oot_mla.py b/vllm_gaudi/attention/oot_mla.py
index 1b6f27d40..5f14c7a82 100644
--- a/vllm_gaudi/attention/oot_mla.py
+++ b/vllm_gaudi/attention/oot_mla.py
@@ -55,7 +55,7 @@ def forward(
         attn_metadata = forward_context.attn_metadata
         if isinstance(attn_metadata, dict):
             attn_metadata = attn_metadata[self.layer_name]
-        self_kv_cache = self.kv_cache[0]
+        self_kv_cache = self.kv_cache
 
         #slot_mapping = forward_context.slot_mapping
         #assert isinstance(slot_mapping, dict), (
diff --git a/vllm_gaudi/models/qwen3_5.py b/vllm_gaudi/models/qwen3_5.py
index 3b58089e0..d8ffd0e0a 100644
--- a/vllm_gaudi/models/qwen3_5.py
+++ b/vllm_gaudi/models/qwen3_5.py
@@ -70,9 +70,8 @@ def _extract_metadata(self, num_tokens):
         is_prompt = bool(getattr(attn_metadata, "is_prompt", False))
         state_indices = self._resolve_state_indices(attn_metadata)
 
-        self_kv_cache = self.kv_cache[0]
-        conv_state = self_kv_cache[0]
-        ssm_state = self_kv_cache[1]
+        conv_state = self.kv_cache[0]
+        ssm_state = self.kv_cache[1]
 
         query_start_loc = attn_metadata.query_start_loc_p
         has_initial_state = getattr(attn_metadata, "has_initial_states_p", None)
diff --git a/vllm_gaudi/models/roberta.py b/vllm_gaudi/models/roberta.py
deleted file mode 100644
index cd1b9f989..000000000
--- a/vllm_gaudi/models/roberta.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import torch
-from vllm.sequence import IntermediateTensors
-from vllm.model_executor.models.bert import TOKEN_TYPE_SHIFT
-from vllm.model_executor.models.roberta import RobertaForSequenceClassification, replace_roberta_positions
-
-
-def patched_RobertaForSequenceClassification_forward(
-    self,
-    input_ids: torch.Tensor | None,
-    positions: torch.Tensor,
-    intermediate_tensors: IntermediateTensors | None = None,
-    inputs_embeds: torch.Tensor | None = None,
-    token_type_ids: torch.Tensor | None = None,
-) -> torch.Tensor:
-    replace_roberta_positions(input_ids=input_ids, position_ids=positions, padding_idx=self.padding_idx)
-    if token_type_ids is not None:
-        assert self.roberta.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
-        assert input_ids is not None
-
-    return self.roberta(
-        input_ids=input_ids,
-        positions=positions,
-        inputs_embeds=inputs_embeds,
-        intermediate_tensors=intermediate_tensors,
-    )
-
-
-RobertaForSequenceClassification.forward = patched_RobertaForSequenceClassification_forward
diff --git a/vllm_gaudi/ops/hpu_attention.py b/vllm_gaudi/ops/hpu_attention.py
index 2f2821b40..5afcb2c25 100644
--- a/vllm_gaudi/ops/hpu_attention.py
+++ b/vllm_gaudi/ops/hpu_attention.py
@@ -53,7 +53,7 @@ def patched_attention_forward(
     attn_metadata = forward_context.attn_metadata
     if isinstance(attn_metadata, dict):
         attn_metadata = attn_metadata[self.layer_name]
-    self_kv_cache = self.kv_cache[0]
+    self_kv_cache = self.kv_cache
 
     return self.impl.forward(self, query, key, value, self_kv_cache, attn_metadata)
diff --git a/vllm_gaudi/ops/hpu_grouped_topk_router.py b/vllm_gaudi/ops/hpu_grouped_topk_router.py
index c6ae43bea..ab32acb59 100644
--- a/vllm_gaudi/ops/hpu_grouped_topk_router.py
+++ b/vllm_gaudi/ops/hpu_grouped_topk_router.py
@@ -5,8 +5,6 @@
 import vllm
 from vllm import envs as envs
-from vllm.model_executor.layers.batch_invariant import (
-    vllm_is_batch_invariant, )
 from vllm.model_executor.utils import maybe_disable_graph_partition
 from vllm.platforms import current_platform
 from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import (GroupedTopk, fused_grouped_topk)
 
@@ -55,7 +53,7 @@ def grouped_topk(
         raise ValueError(f"Unsupported scoring function: {scoring_func}")
 
     # For batch invariance, use sorted=True to ensure deterministic expert selection
-    use_sorted = vllm_is_batch_invariant()
+    use_sorted = envs.VLLM_BATCH_INVARIANT
 
     num_token = scores.size(0)
 
@@ -130,7 +128,7 @@ def forward_oot(
         raise ValueError(f"Unsupported scoring function: {self.scoring_func}")
 
     # For batch invariance, use sorted=True to ensure deterministic expert selection
-    use_sorted = vllm_is_batch_invariant()
+    use_sorted = envs.VLLM_BATCH_INVARIANT
 
     num_token = scores.size(0)
     if e_score_correction_bias is not None:
diff --git a/vllm_gaudi/v1/kv_offload/worker/cpu_hpu.py b/vllm_gaudi/v1/kv_offload/worker/cpu_hpu.py
index 3e1106901..4b1c7f6c5 100644
--- a/vllm_gaudi/v1/kv_offload/worker/cpu_hpu.py
+++ b/vllm_gaudi/v1/kv_offload/worker/cpu_hpu.py
@@ -18,7 +18,7 @@
     OffloadingHandler,
     TransferSpec,
 )
-from vllm.v1.kv_offload.cpu import CPUOffloadingSpec
+from vllm.v1.kv_offload.cpu.spec import CPUOffloadingSpec
 from vllm.v1.kv_offload.abstract import LoadStoreSpec
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.worker.cpu_gpu import (SingleDirectionOffloadingHandler, CpuGpuOffloadingHandlers)
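
Note on the `key_cache is None` guard added in `hpu_attn.py`: vLLM's memory-profiling warmup runs the forward pass before KV caches are bound, so the decode path can see `key_cache == None` and must not call the paged-attention kernel. A minimal sketch of the pattern, with illustrative shapes; the function name and the fallback computation below are hypothetical stand-ins, not the HPU implementation:

```python
import torch


def decode_attention(query: torch.Tensor,
                     key_cache: torch.Tensor | None,
                     output_shape: tuple[int, ...]) -> torch.Tensor:
    if key_cache is None:
        # Warmup/profiling reaches this path with no cache allocated;
        # return a correctly shaped zero tensor instead of dereferencing
        # an unbound cache.
        return torch.zeros(*output_shape, dtype=query.dtype, device=query.device)
    # The real code calls HPUPagedAttention.forward_decode here; a dummy
    # reshape stands in to keep the sketch self-contained.
    return query.reshape(output_shape)


# Example: a profiling-style call with no cache still yields a usable tensor.
q = torch.randn(4, 8, 64)
out = decode_attention(q, None, (4, 512))
assert out.shape == (4, 512) and out.count_nonzero() == 0
```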
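
The router hunks replace the removed `vllm_is_batch_invariant()` helper with the `envs.VLLM_BATCH_INVARIANT` flag; either way, the flag's only job here is to force `sorted=True` in top-k expert selection so routing output is deterministic regardless of batch composition. A sketch of that mechanism under stated assumptions: the env parsing and the `select_experts` wrapper below are illustrative, not vLLM's code.

```python
import os

import torch

# Stand-in for vllm's envs.VLLM_BATCH_INVARIANT flag (an assumption:
# vllm parses it once in its envs module, not ad hoc like this).
VLLM_BATCH_INVARIANT = os.environ.get("VLLM_BATCH_INVARIANT", "0") == "1"


def select_experts(scores: torch.Tensor, topk: int) -> tuple[torch.Tensor, torch.Tensor]:
    # sorted=True makes torch.topk return the chosen experts in descending
    # score order, so output ordering does not vary with how requests
    # happen to be batched together.
    use_sorted = VLLM_BATCH_INVARIANT
    topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1, sorted=use_sorted)
    return topk_weights, topk_ids


# Example: per-token router scores over 8 experts, pick top-2.
scores = torch.softmax(torch.randn(4, 8), dim=-1)
weights, ids = select_experts(scores, topk=2)
assert weights.shape == ids.shape == (4, 2)
```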