From f8a78d8c092acc2df45c98819adbf4ca6e59ea74 Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Fri, 16 Jan 2026 17:57:49 +0200
Subject: [PATCH 01/16] [FIX_FOR_VLLM_LATEST] Fix for is_aiter_triton_fp4_bmm_enabled in mla_attention #32238

Signed-off-by: Iryna Boiko
---
 vllm_gaudi/attention/backends/hpu_attn.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py
index 8d62aa463..a8a2aa9a7 100644
--- a/vllm_gaudi/attention/backends/hpu_attn.py
+++ b/vllm_gaudi/attention/backends/hpu_attn.py
@@ -240,6 +240,9 @@ def __init__(
         assert self.prefill_impl != 'fsdpa_impl' or alibi_slopes is None, \
             'Prefill with FusedSDPA not supported with alibi slopes!'
         self.is_aiter_triton_fp8_bmm_enabled = rocm_aiter_ops.is_fp8bmm_enabled()
+        # If kv_b_proj_weight is unquantized, quantize it to mxfp4 if supported
+        self.is_aiter_triton_fp4_bmm_enabled = (rocm_aiter_ops.is_fp4bmm_enabled()
+                                                and self.kv_b_proj.weight.dtype == torch.bfloat16)
 
         unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
         if any(unsupported_features):

From 38013863ec0c9093b34dda3315211ee0b663d09b Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Mon, 19 Jan 2026 12:59:05 +0200
Subject: [PATCH 02/16] Fix for 30623

Signed-off-by: Iryna Boiko
---
 vllm_gaudi/ops/hpu_compressed_tensors.py | 2 +-
 vllm_gaudi/ops/hpu_fp8.py                | 2 +-
 vllm_gaudi/ops/hpu_fused_moe.py          | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm_gaudi/ops/hpu_compressed_tensors.py b/vllm_gaudi/ops/hpu_compressed_tensors.py
index a682fc792..f94a309a8 100644
--- a/vllm_gaudi/ops/hpu_compressed_tensors.py
+++ b/vllm_gaudi/ops/hpu_compressed_tensors.py
@@ -6,7 +6,7 @@
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.linear import WEIGHT_LOADER_V2_SUPPORTED
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEConfig)
-from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe import FusedMoERouter
 from compressed_tensors.quantization import (QuantizationArgs, QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import convert_to_channelwise, all_close_1d

diff --git a/vllm_gaudi/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py
index 27cc70b39..d70aae029 100644
--- a/vllm_gaudi/ops/hpu_fp8.py
+++ b/vllm_gaudi/ops/hpu_fp8.py
@@ -5,7 +5,7 @@
 from vllm_gaudi import envs
 from torch.nn.parameter import Parameter
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
-from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe import FusedMoERouter
 from vllm.model_executor.layers.quantization import fp8
 from vllm.model_executor.layers.quantization.fp8 import (Fp8LinearMethod as OrigFp8LinearMethod, Fp8MoEMethod,

diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py
index a27710fa7..a5d0f2bc4 100644
--- a/vllm_gaudi/ops/hpu_fused_moe.py
+++ b/vllm_gaudi/ops/hpu_fused_moe.py
@@ -4,8 +4,8 @@
 import torch
 import vllm
 from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
-from vllm.model_executor.layers.fused_moe.fused_moe import GroupedTopk
-from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
+from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import GroupedTopk
+from vllm.model_executor.layers.fused_moe import (FusedMoERouter)
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, UnquantizedFusedMoEMethod)
 from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOp)
 from vllm_gaudi.extension.runtime import get_config
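Note on PATCH 02: the router symbols moved in upstream vLLM, which is why the plugin's imports break against vllm latest. If the plugin ever had to load against both layouts, a guarded import would be the usual pattern; a minimal sketch (both import paths are taken verbatim from the diff above, the fallback structure itself is not part of this series):

    try:
        # Post-refactor layout targeted by PATCH 02.
        from vllm.model_executor.layers.fused_moe import FusedMoERouter
    except ImportError:
        # Pre-refactor layout the old code imported from.
        from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter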
From 4c8865c94fc51dfe1d80f6c02a8a543d43dd9291 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 20 Jan 2026 14:57:40 +0200
Subject: [PATCH 03/16] Fix for 'RowParallelLinear' object has no attribute 'input_scale'. Did you mean: 'input_size'

Signed-off-by: root
---
 vllm_gaudi/extension/ops.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py
index d02ffc6cc..0cee12a08 100644
--- a/vllm_gaudi/extension/ops.py
+++ b/vllm_gaudi/extension/ops.py
@@ -729,7 +729,6 @@ def apply_block_fp8_linear_hpu(
         layer.weight,
         block_size,
         layer.weight_scale_inv,
-        input_scale=layer.input_scale,
         bias=bias,
         original_M=layer.orig_M,
         original_N=layer.orig_N,

From d52402b97ec3a7dd98e114ca6589bc2676d775ca Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com>
Date: Tue, 20 Jan 2026 14:00:24 +0100
Subject: [PATCH 04/16] Update vllm_gaudi/ops/hpu_fused_moe.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com>
---
 vllm_gaudi/ops/hpu_fused_moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py
index a5d0f2bc4..477b458da 100644
--- a/vllm_gaudi/ops/hpu_fused_moe.py
+++ b/vllm_gaudi/ops/hpu_fused_moe.py
@@ -5,7 +5,7 @@
 import vllm
 from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
 from vllm.model_executor.layers.fused_moe.router.grouped_topk_router import GroupedTopk
-from vllm.model_executor.layers.fused_moe import (FusedMoERouter)
+from vllm.model_executor.layers.fused_moe import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, UnquantizedFusedMoEMethod)
 from vllm_gaudi.extension.ops import (VllmMixtureOfExpertsOp)
 from vllm_gaudi.extension.runtime import get_config

From deca5e149bdff169fc96d1daeb70c48fd6adad72 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 20 Jan 2026 17:10:48 +0200
Subject: [PATCH 05/16] Maybe like that??

Signed-off-by: root
---
 vllm_gaudi/extension/ops.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py
index 2ff64c87c..f321b7dd7 100644
--- a/vllm_gaudi/extension/ops.py
+++ b/vllm_gaudi/extension/ops.py
@@ -729,7 +729,6 @@ def apply_block_fp8_linear_hpu(
         input_2d,
         layer.weight,
         layer.weight_scale_inv,
-        layer.input_scale,
         bias,
     )
     return output.to(dtype=input.dtype).view(*input.shape[:-1], -1)
@@ -773,15 +772,11 @@ def apply_fp8_linear_hpu(
     input: torch.Tensor,
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
     trans_B: bool = True,
 ):
-    if input_scale is None:
-        x_fp8, x_scale = dynamic_quant(input)
-    else:
-        x_fp8 = torch.ops.hpu.cast_to_fp8_v2(input, 1.0 / input_scale, False, False, torch.float8_e4m3fn)[0]
-        x_scale = input_scale
+    x_fp8, x_scale = dynamic_quant(input)
+
     output = torch.ops.hpu.fp8_gemm_v2(A=x_fp8,
                                        trans_A=False,
                                        B=weight,
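Context for PATCH 05: with input_scale gone, apply_fp8_linear_hpu always quantizes activations dynamically. dynamic_quant lives in vllm_gaudi.extension.ops; conceptually it derives a per-tensor scale from the live activation instead of a calibrated constant. A rough, standalone sketch of that idea in plain PyTorch (illustrative only, not the HPU implementation):

    import torch

    E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn

    def dynamic_quant_sketch(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        # The scale comes from the tensor itself, so no calibration-time
        # input_scale is needed; that is the 'dynamic' in dynamic_quant.
        scale = x.abs().amax().clamp(min=1e-12).float() / E4M3_MAX
        x_fp8 = (x / scale).to(torch.float8_e4m3fn)
        return x_fp8, scale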
From aa7665af13b154814d49e86822bfcbd0152587e7 Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com>
Date: Tue, 20 Jan 2026 16:31:23 +0100
Subject: [PATCH 06/16] Update ops.py

Signed-off-by: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com>
---
 vllm_gaudi/extension/ops.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py
index f321b7dd7..f0c33b779 100644
--- a/vllm_gaudi/extension/ops.py
+++ b/vllm_gaudi/extension/ops.py
@@ -776,7 +776,6 @@ def apply_fp8_linear_hpu(
     trans_B: bool = True,
 ):
     x_fp8, x_scale = dynamic_quant(input)
-
     output = torch.ops.hpu.fp8_gemm_v2(A=x_fp8,
                                        trans_A=False,
                                        B=weight,

From 96b45b33dba504cd459699e6e197988d0a47df49 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 20 Jan 2026 17:48:14 +0200
Subject: [PATCH 07/16] Fix?

Signed-off-by: root
---
 vllm_gaudi/ops/hpu_compressed_tensors.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm_gaudi/ops/hpu_compressed_tensors.py b/vllm_gaudi/ops/hpu_compressed_tensors.py
index f94a309a8..6228c0a2d 100644
--- a/vllm_gaudi/ops/hpu_compressed_tensors.py
+++ b/vllm_gaudi/ops/hpu_compressed_tensors.py
@@ -190,11 +190,9 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
 
     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None):
         weight_scale = layer.weight_scale.transpose(0, 1) if layer.weight_scale.dim() > 1 else layer.weight_scale
-        input_scale = getattr(layer, 'input_scale', None)
         return hpu_ops.apply_fp8_linear_hpu(input=x,
                                             weight=layer.weight,
                                             weight_scale=weight_scale,
-                                            input_scale=input_scale,
                                             bias=bias,
                                             trans_B=False)
From e2b1280adba7f162cc2e6d57dc06dae749b67f08 Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Wed, 21 Jan 2026 13:53:55 +0200
Subject: [PATCH 08/16] Fix for #32077, maybe_setup_kv_connector

Signed-off-by: Iryna Boiko
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 541334cd9..739542e39 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -43,7 +43,7 @@
 from vllm.config import (VllmConfig, update_config)
 from vllm.distributed.kv_transfer import (get_kv_transfer_group, has_kv_transfer_group)
-from vllm.forward_context import set_forward_context
+from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.vocab_parallel_embedding import (VocabParallelEmbedding)
@@ -93,6 +93,7 @@
 from vllm_gaudi.extension.ops import LoraMask as LoraMask
 from vllm.distributed.kv_transfer.kv_connector.utils import copy_kv_blocks
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import NixlConnectorMetadata
+from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
 from vllm.v1.core.sched.output import GrammarOutput
 
 if TYPE_CHECKING:
@@ -3769,6 +3770,20 @@ def _maybe_compile(self, *args, **kwargs):
         else:
             self.model = self._compile(self.model)
 
+    def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"):
+        # Update KVConnector with the KVConnector metadata forward().
+        if has_kv_transfer_group():
+            kv_connector = get_kv_transfer_group()
+            assert isinstance(kv_connector, KVConnectorBase)
+            assert scheduler_output.kv_connector_metadata is not None
+            kv_connector.bind_connector_metadata(scheduler_output.kv_connector_metadata)
+
+            # Background KV cache transfers happen here.
+            # These transfers are designed to be async and the requests
+            # involved may be disjoint from the running requests.
+            # Do this here to save a collective_rpc.
+            kv_connector.start_load_kv(get_forward_context())
+
     def _compile_methods(self):
         """
         Compile methods which are not part of the compiled model i.e. those
From 73b91c0c27b920fdb50bd4fcaf51c315f005b702 Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Wed, 21 Jan 2026 13:01:01 +0100
Subject: [PATCH 09/16] Update hpu_model_runner.py

Signed-off-by: Iryna Boiko
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 38b923e01..fc6e5c97a 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -3770,7 +3770,7 @@ def _maybe_compile(self, *args, **kwargs):
         else:
             self.model = self._compile(self.model)
 
-    def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"):
+    def maybe_setup_kv_connector(self, scheduler_output: "SchedulerOutput"):
         # Update KVConnector with the KVConnector metadata forward().
         if has_kv_transfer_group():
             kv_connector = get_kv_transfer_group()

From e213f3e06e0bdfbe23ad01ada198a5f553e2f483 Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Wed, 21 Jan 2026 13:42:47 +0100
Subject: [PATCH 10/16] Update hpu_model_runner.py - more fixes

Signed-off-by: Iryna Boiko
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index fc6e5c97a..110d1c189 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -3770,6 +3770,7 @@ def _maybe_compile(self, *args, **kwargs):
         else:
             self.model = self._compile(self.model)
 
+    @staticmethod
     def maybe_setup_kv_connector(self, scheduler_output: "SchedulerOutput"):
         # Update KVConnector with the KVConnector metadata forward().
         if has_kv_transfer_group():
@@ -3784,6 +3785,17 @@ def maybe_setup_kv_connector(self, scheduler_output: "SchedulerOutput"):
             # Do this here to save a collective_rpc.
             kv_connector.start_load_kv(get_forward_context())
 
+    @staticmethod
+    def maybe_wait_for_kv_save() -> None:
+        if has_kv_transfer_group():
+            get_kv_transfer_group().wait_for_save()
+
+    @staticmethod
+    def get_finished_kv_transfers(scheduler_output: "SchedulerOutput", ) -> tuple[set[str] | None, set[str] | None]:
+        if has_kv_transfer_group():
+            return get_kv_transfer_group().get_finished(scheduler_output.finished_req_ids)
+        return None, None
+
     def _compile_methods(self):
         """
         Compile methods which are not part of the compiled model i.e. those
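Taken together, the three helpers from PATCHES 08-10 bracket one runner step: bind connector metadata and start async KV loads before the forward pass, then flush pending saves and collect finished transfers after it (the call sites in sample_tokens, touched again in PATCH 15, follow exactly this order). A hedged sketch of that lifecycle; the glue function below is illustrative, not code from this series:

    from typing import Any, Callable

    def run_one_step(runner: Any, scheduler_output: Any, forward: Callable[[], Any]):
        # 1. Bind KV-connector metadata and kick off async KV loads.
        runner.maybe_setup_kv_connector(scheduler_output)
        # 2. The forward pass overlaps with the background transfers.
        output = forward()
        # 3. Make sure pending KV saves are issued before the step finishes.
        runner.maybe_wait_for_kv_save()
        # 4. Report request ids whose async transfers completed this step.
        finished_sending, finished_recving = runner.get_finished_kv_transfers(scheduler_output)
        return output, finished_sending, finished_recving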
From f74815df675d217641ed3aa5f2bc202d95a8ae52 Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Wed, 21 Jan 2026 13:43:42 +0100
Subject: [PATCH 11/16] Update hpu_model_runner.py

Signed-off-by: Iryna Boiko
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 110d1c189..3342868d6 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -3786,12 +3786,12 @@ def maybe_setup_kv_connector(self, scheduler_output: "SchedulerOutput"):
             kv_connector.start_load_kv(get_forward_context())
 
     @staticmethod
-    def maybe_wait_for_kv_save() -> None:
+    def maybe_wait_for_kv_save(self) -> None:
         if has_kv_transfer_group():
             get_kv_transfer_group().wait_for_save()
 
     @staticmethod
-    def get_finished_kv_transfers(scheduler_output: "SchedulerOutput", ) -> tuple[set[str] | None, set[str] | None]:
+    def get_finished_kv_transfers(self, scheduler_output: "SchedulerOutput", ) -> tuple[set[str] | None, set[str] | None]:
         if has_kv_transfer_group():
             return get_kv_transfer_group().get_finished(scheduler_output.finished_req_ids)
         return None, None

From 62031fdae4994f874b9280eaeb38cfeeec6b466f Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Wed, 21 Jan 2026 14:15:59 +0100
Subject: [PATCH 12/16] Update hpu_model_runner.py

Signed-off-by: Iryna Boiko
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 60 +++++++++++++-----------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 3342868d6..367e4843d 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -380,8 +380,38 @@ def apply_model_specific_patches(model):
     patch_llama4_get_attn_scale(model)
 
 
+class HpuKVConnectorModelRunnerMixin(KVConnectorModelRunnerMixin):
+
+    def __init__(self, model, vllm_config):
+        super().__init__()
+
+    @staticmethod
+    def maybe_setup_kv_connector(self, scheduler_output: "SchedulerOutput"):
+        # Update KVConnector with the KVConnector metadata forward().
+        if has_kv_transfer_group():
+            kv_connector = get_kv_transfer_group()
+            assert isinstance(kv_connector, KVConnectorBase)
+            assert scheduler_output.kv_connector_metadata is not None
+            kv_connector.bind_connector_metadata(scheduler_output.kv_connector_metadata)
+
+            # Background KV cache transfers happen here.
+            # These transfers are designed to be async and the requests
+            # involved may be disjoint from the running requests.
+            # Do this here to save a collective_rpc.
+            kv_connector.start_load_kv(get_forward_context())
+
+    @staticmethod
+    def maybe_wait_for_kv_save(self) -> None:
+        if has_kv_transfer_group():
+            get_kv_transfer_group().wait_for_save()
+
+    @staticmethod
+    def get_finished_kv_transfers(self, scheduler_output: "SchedulerOutput", ) -> tuple[set[str] | None, set[str] | None]:
+        if has_kv_transfer_group():
+            return get_kv_transfer_group().get_finished(scheduler_output.finished_req_ids)
+        return None, None
+
+
-class HpuModelAdapter(torch.nn.Module, KVConnectorModelRunnerMixin):
+class HpuModelAdapter(torch.nn.Module, HpuKVConnectorModelRunnerMixin):
 
     def __init__(self, model, vllm_config):
         super().__init__()
@@ -611,7 +641,7 @@ def get_dp_padding(num_tokens: int, dp_size: int, dp_rank: int) -> int:
     return max_tokens_across_dp_cpu - num_tokens
 
 
-class HPUModelRunner(KVConnectorModelRunnerMixin):
+class HPUModelRunner(HpuKVConnectorModelRunnerMixin):
 
     def __init__(
         self,
@@ -3770,32 +3800,6 @@ def _maybe_compile(self, *args, **kwargs):
         else:
             self.model = self._compile(self.model)
 
-    @staticmethod
-    def maybe_setup_kv_connector(self, scheduler_output: "SchedulerOutput"):
-        # Update KVConnector with the KVConnector metadata forward().
-        if has_kv_transfer_group():
-            kv_connector = get_kv_transfer_group()
-            assert isinstance(kv_connector, KVConnectorBase)
-            assert scheduler_output.kv_connector_metadata is not None
-            kv_connector.bind_connector_metadata(scheduler_output.kv_connector_metadata)
-
-            # Background KV cache transfers happen here.
-            # These transfers are designed to be async and the requests
-            # involved may be disjoint from the running requests.
-            # Do this here to save a collective_rpc.
-            kv_connector.start_load_kv(get_forward_context())
-
-    @staticmethod
-    def maybe_wait_for_kv_save(self) -> None:
-        if has_kv_transfer_group():
-            get_kv_transfer_group().wait_for_save()
-
-    @staticmethod
-    def get_finished_kv_transfers(self, scheduler_output: "SchedulerOutput", ) -> tuple[set[str] | None, set[str] | None]:
-        if has_kv_transfer_group():
-            return get_kv_transfer_group().get_finished(scheduler_output.finished_req_ids)
-        return None, None
-
     def _compile_methods(self):
         """
         Compile methods which are not part of the compiled model i.e. those
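The extraction in PATCH 12 gives HpuModelAdapter and HPUModelRunner a single HPU-specific home for the connector helpers instead of two copies layered directly on KVConnectorModelRunnerMixin. A toy model of the shape this series converges on (names below are placeholders; the real signatures are settled in PATCHES 13-14):

    class BaseMixin:  # stands in for vLLM's KVConnectorModelRunnerMixin
        pass

    class HpuMixinSketch(BaseMixin):

        @staticmethod
        def maybe_wait_for_kv_save() -> None:
            # The helpers need no per-instance state, hence plain staticmethods.
            pass

    class AdapterSketch(HpuMixinSketch): ...
    class RunnerSketch(HpuMixinSketch): ...

    # Both subclasses resolve to the same single implementation:
    assert AdapterSketch.maybe_wait_for_kv_save is RunnerSketch.maybe_wait_for_kv_save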
From 5d324ac02e1595221733a465b77430067c0d25b2 Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Wed, 21 Jan 2026 14:22:26 +0100
Subject: [PATCH 13/16] Update hpu_model_runner.py

Signed-off-by: Iryna Boiko
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 0abb253ae..d09bdca48 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -385,7 +385,7 @@ def __init__(self, model, vllm_config):
         super().__init__()
 
     @staticmethod
-    def maybe_setup_kv_connector(self, scheduler_output: "SchedulerOutput"):
+    def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"):
         # Update KVConnector with the KVConnector metadata forward().
         if has_kv_transfer_group():
             kv_connector = get_kv_transfer_group()
@@ -400,12 +400,12 @@ def maybe_setup_kv_connector(self, scheduler_output: "SchedulerOutput"):
             kv_connector.start_load_kv(get_forward_context())
 
     @staticmethod
-    def maybe_wait_for_kv_save(self) -> None:
+    def maybe_wait_for_kv_save() -> None:
         if has_kv_transfer_group():
             get_kv_transfer_group().wait_for_save()
 
     @staticmethod
-    def get_finished_kv_transfers(self, scheduler_output: "SchedulerOutput", ) -> tuple[set[str] | None, set[str] | None]:
+    def get_finished_kv_transfers(scheduler_output: "SchedulerOutput", ) -> tuple[set[str] | None, set[str] | None]:
         if has_kv_transfer_group():
             return get_kv_transfer_group().get_finished(scheduler_output.finished_req_ids)
         return None, None
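Why PATCH 13 drops `self` again: under @staticmethod no instance is bound, so a leading `self` parameter just consumes the caller's first real argument. A minimal repro, independent of the runner code:

    class Broken:

        @staticmethod
        def f(self, x):  # 'self' is an ordinary positional parameter here
            return x

    class Fixed:

        @staticmethod
        def f(x):
            return x

    # Broken().f(42) raises TypeError: f() missing 1 required positional
    # argument: 'x' -- the 42 was swallowed by the bogus 'self'.
    assert Fixed().f(42) == 42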
From 8257e3625189059d8e8f57fee8477517c6dc0442 Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Wed, 21 Jan 2026 14:29:08 +0100
Subject: [PATCH 14/16] Update hpu_model_runner.py

Signed-off-by: Iryna Boiko
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index d09bdca48..677e41d83 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -381,7 +381,7 @@ def apply_model_specific_patches(model):
     patch_llama4_get_attn_scale(model)
 
 class HpuKVConnectorModelRunnerMixin(KVConnectorModelRunnerMixin):
-    def __init__(self, model, vllm_config):
+    def __init__(self):
         super().__init__()
 
     @staticmethod

From e163f560169dd1c24d8fe6beff303181cdafd6fa Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Wed, 21 Jan 2026 15:10:21 +0100
Subject: [PATCH 15/16] Update hpu_model_runner.py

Signed-off-by: Iryna Boiko
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 677e41d83..317f1321e 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -3339,7 +3339,7 @@ def sample_tokens(self, grammar_output: "GrammarOutput | None") -> ModelRunnerOu
         else:
             with set_forward_context(None, self.vllm_config):
                 self.maybe_setup_kv_connector(scheduler_output)
-            finished_sending, finished_recving = set(), set()
+            finished_sending, finished_recving = set[str](), set[str]()
 
         # NOTE(Chendi): used by spec decode draft model, since we are doing
         # prefill one by one, so save hidden states as list
@@ -3666,7 +3666,7 @@ def sample_tokens(self, grammar_output: "GrammarOutput | None") -> ModelRunnerOu
 
         if not warmup_mode:
             self.maybe_wait_for_kv_save()
-            finished_sending, finished_recving = self.get_finished_kv_transfers(scheduler_output)
+            finished_sending, finished_recving = self.get_finished_kv_transfers(scheduler_output)  # type: ignore
 
         if self.use_async_scheduling:
             model_runner_output = ModelRunnerOutput(

From 6a9b0dba67748fc25e5981c3726c24f41291286b Mon Sep 17 00:00:00 2001
From: Iryna Boiko
Date: Wed, 21 Jan 2026 15:21:48 +0100
Subject: [PATCH 16/16] Update hpu_model_runner.py

Signed-off-by: Iryna Boiko
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 317f1321e..d6c0a6e0b 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -380,7 +380,9 @@ def apply_model_specific_patches(model):
     patch_llama4_get_attn_scale(model)
 
 
+
 class HpuKVConnectorModelRunnerMixin(KVConnectorModelRunnerMixin):
+
     def __init__(self):
         super().__init__()
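On the set[str]() change in PATCH 15: since Python 3.9, subscripting a builtin yields a callable types.GenericAlias, so set[str]() builds an ordinary empty set at runtime while letting type checkers infer set[str] instead of an unparameterized set for the empty literal. A quick check:

    s = set[str]()
    assert isinstance(s, set) and len(s) == 0
    s.add("req-1")  # runtime behaviour is identical to a plain set()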