From 4f7f6150be30f4e3629a3286c7a2e11f0bff79a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Smyrek?=
Date: Wed, 21 Jan 2026 10:17:30 +0100
Subject: [PATCH 1/4] Interleaved sliding window fix (#805)

Following the reasoning stated in PR
https://github.com/vllm-project/vllm-gaudi/pull/616: only take the
sliding-window paths when `self.sliding_window` is actually set
(a standalone sketch of the guard follows PATCH 2/4).

Signed-off-by: Radoslaw Smyrek
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index c9488523e7..49fe2d4520 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -5668,12 +5668,12 @@ def process_metadata(self, attn_metadata: HPUAttentionMetadataV1, batch_size: in
         """
         if attn_metadata.is_prompt:
             attn_metadata = self._set_attn_bias(attn_metadata, batch_size, seq_len, device, dtype)
-            if self.interleaved_sliding_window:
+            if self.interleaved_sliding_window and self.sliding_window is not None:
                 attn_metadata = self._set_attn_bias_for_sliding_window(attn_metadata, batch_size, seq_len,
                                                                        self.sliding_window, device, dtype)
         else:
             attn_metadata = self._set_block_mapping(attn_metadata, batch_size, device, dtype)
-            if self.interleaved_sliding_window:
+            if self.interleaved_sliding_window and self.sliding_window is not None:
                 attn_metadata = self._set_block_mapping(attn_metadata, batch_size, device, dtype, True)
 
         return attn_metadata

From d4ef895234d35753f215cb86d13b48b97ff8414c Mon Sep 17 00:00:00 2001
From: Linoy Buchnik
Date: Wed, 21 Jan 2026 15:07:55 +0200
Subject: [PATCH 2/4] [GAUDISW-245665] fix divergence from vLLM in
 MultiModalBudget (#837)

Signed-off-by: linoy buchnik
Signed-off-by: Iryna Boiko
Co-authored-by: Iryna Boiko
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 49fe2d4520..a4eae8c0a1 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -4620,8 +4620,7 @@ def warmup_multimodal_graphs(self, buckets):
         phase = 'Graph/Multimodal'
         from vllm.v1.worker.utils import MultiModalBudget
         self.mm_budget = MultiModalBudget(
-            self.model_config,
-            self.scheduler_config,
+            self.vllm_config,
             self.mm_registry,
         ) if self.supports_mm_inputs else None
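A minimal standalone sketch of the guard added in PATCH 1/4, assuming plain
PyTorch; `make_sliding_window_bias` is a hypothetical stand-in for
`_set_attn_bias_for_sliding_window`, and the `sliding_window = None` trigger is
inferred from the added check, not taken from the vllm-gaudi code:

```python
import torch

def make_sliding_window_bias(seq_len: int, window: int) -> torch.Tensor:
    # Causal additive bias that also cuts off keys more than `window - 1`
    # positions behind the query.
    pos = torch.arange(seq_len)
    dist = pos.unsqueeze(0) - pos.unsqueeze(1)  # dist[q, k] = k - q
    allowed = (dist <= 0) & (dist > -window)    # k <= q and q - k < window
    return torch.zeros(seq_len, seq_len).masked_fill(~allowed, float("-inf"))

interleaved_sliding_window = True
sliding_window = None  # assumed failure trigger: interleaved flag set, no window size

# The old guard, `if interleaved_sliding_window:`, would reach the bias code
# with window=None and fail inside it (`dist > -None` raises TypeError).
# The fixed guard also demands a concrete window:
if interleaved_sliding_window and sliding_window is not None:
    bias = make_sliding_window_bias(seq_len=8, window=sliding_window)
```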
From 511fc131e2661f7ced3e739e64e7c4e473eb698a Mon Sep 17 00:00:00 2001
From: Artur Fierka
Date: Mon, 26 Jan 2026 08:13:14 +0100
Subject: [PATCH 3/4] Fix Llama4 shape mismatch for 32k+ context window (#842)
 (#855)

For `max_model_len > 32k`, Llama4 enables temperature adjustment:
https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama4.py#L719

With the adjustment enabled, the `q` tensor changes from 2D to 3D:
https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama4.py#L307

This tensor is passed to `UnquantizedFusedMoEMethod.forward`:
https://github.com/vllm-project/vllm-gaudi/blob/main/vllm_gaudi/ops/hpu_fused_moe.py#L163

where the final reshape then fails: we try to return a 3D `output.view` of a
2D output tensor (reproduced in the sketch after PATCH 4/4).

The bug was introduced by PRs #680 and #684.

Cherry-picked from `releases/v0.13.0`.

---------

Signed-off-by: Artur Fierka
---
 vllm_gaudi/ops/hpu_fused_moe.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py
index a27710fa74..4168d515ee 100644
--- a/vllm_gaudi/ops/hpu_fused_moe.py
+++ b/vllm_gaudi/ops/hpu_fused_moe.py
@@ -160,7 +160,10 @@ def forward_oot(
             permuted_weights=True,
             activation=layer.activation,
         )
-        return output.view(*(output.size(0), *input_shape[1:]))
+        if layer.dp_size > 1:
+            return output.view(*(output.size(0), *input_shape[1:]))
+        else:
+            return output.view(*input_shape)
 
     def reduce_output(self, states: torch.Tensor) -> torch.Tensor:

From 70d8e72b5ccadec08a970edd76649c2a77f16878 Mon Sep 17 00:00:00 2001
From: Radoslaw Smyrek
Date: Thu, 22 Jan 2026 13:18:52 +0200
Subject: [PATCH 4/4] Flatten positions only when QK norm is enabled

Signed-off-by: Radoslaw Smyrek
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index a4eae8c0a1..aaa6b9f4f7 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -370,7 +370,8 @@ def patch_llama4_get_attn_scale(model):
         orig = attn._get_attn_scale
 
         def _get_attn_scale_for_hpu(self, positions, _orig=orig):
-            positions = positions.flatten()
+            if self.qk_norm is not None:
+                positions = positions.flatten()
             return _orig(positions)
 
         attn._get_attn_scale = types.MethodType(_get_attn_scale_for_hpu, attn)
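The failure mode fixed in PATCH 3/4 can be reproduced in a few lines of plain
PyTorch. The sketch below uses illustrative sizes, and a simple flatten stands
in for the HPU fused-MoE kernel, which consumes and produces 2D tensors:

```python
import torch

hidden = 16
x = torch.randn(2, 4, hidden)   # 3D input, as produced by Llama4 temperature
input_shape = x.shape           # adjustment when max_model_len > 32k
output = x.view(-1, hidden)     # stand-in for the MoE kernel: 2D (8, 16) output

# Old return: output.view(output.size(0), *input_shape[1:]) asks for a
# (8, 4, 16) = 512-element view of a 128-element tensor -> RuntimeError.
try:
    output.view(*(output.size(0), *input_shape[1:]))
except RuntimeError as err:
    print(f"old reshape fails: {err}")

# Fixed return for dp_size == 1: restore the caller's original shape.
assert output.view(*input_shape).shape == x.shape  # (2, 4, 16)
```

For 2D inputs both forms agree, since `output.size(0) == input_shape[0]`. The
`dp_size > 1` branch keeps the old form, presumably because data-parallel
gathering can change the leading dimension relative to the local input shape.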