From 4f7f6150be30f4e3629a3286c7a2e11f0bff79a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Smyrek?=
Date: Wed, 21 Jan 2026 10:17:30 +0100
Subject: [PATCH 1/4] Interleaved sliding window fix (#805)

Following the reasoning stated in PR
https://github.com/vllm-project/vllm-gaudi/pull/616: only take the
sliding-window paths when `self.sliding_window` is actually set
(a standalone sketch of the guard follows PATCH 2/4).

Signed-off-by: Radoslaw Smyrek
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index c9488523e7..49fe2d4520 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -5668,12 +5668,12 @@ def process_metadata(self, attn_metadata: HPUAttentionMetadataV1, batch_size: in
         """
         if attn_metadata.is_prompt:
             attn_metadata = self._set_attn_bias(attn_metadata, batch_size, seq_len, device, dtype)
-            if self.interleaved_sliding_window:
+            if self.interleaved_sliding_window and self.sliding_window is not None:
                 attn_metadata = self._set_attn_bias_for_sliding_window(attn_metadata, batch_size, seq_len,
                                                                        self.sliding_window, device, dtype)
         else:
             attn_metadata = self._set_block_mapping(attn_metadata, batch_size, device, dtype)
-            if self.interleaved_sliding_window:
+            if self.interleaved_sliding_window and self.sliding_window is not None:
                 attn_metadata = self._set_block_mapping(attn_metadata, batch_size, device, dtype, True)
 
         return attn_metadata

From d4ef895234d35753f215cb86d13b48b97ff8414c Mon Sep 17 00:00:00 2001
From: Linoy Buchnik
Date: Wed, 21 Jan 2026 15:07:55 +0200
Subject: [PATCH 2/4] [GAUDISW-245665] fix divergence from vLLM in
 MultiModalBudget (#837)

Signed-off-by: linoy buchnik
Signed-off-by: Iryna Boiko
Co-authored-by: Iryna Boiko
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 49fe2d4520..a4eae8c0a1 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -4620,8 +4620,7 @@ def warmup_multimodal_graphs(self, buckets):
         phase = 'Graph/Multimodal'
         from vllm.v1.worker.utils import MultiModalBudget
         self.mm_budget = MultiModalBudget(
-            self.model_config,
-            self.scheduler_config,
+            self.vllm_config,
             self.mm_registry,
         ) if self.supports_mm_inputs else None
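A minimal standalone sketch of the guard added in PATCH 1/4, assuming plain
PyTorch; `make_sliding_window_bias` is a hypothetical stand-in for
`_set_attn_bias_for_sliding_window`, and the `sliding_window = None` trigger is
inferred from the added check, not taken from the vllm-gaudi code:

```python
import torch

def make_sliding_window_bias(seq_len: int, window: int) -> torch.Tensor:
    # Causal additive bias that also cuts off keys more than `window - 1`
    # positions behind the query.
    pos = torch.arange(seq_len)
    dist = pos.unsqueeze(0) - pos.unsqueeze(1)  # dist[q, k] = k - q
    allowed = (dist <= 0) & (dist > -window)    # k <= q and q - k < window
    return torch.zeros(seq_len, seq_len).masked_fill(~allowed, float("-inf"))

interleaved_sliding_window = True
sliding_window = None  # assumed failure trigger: interleaved flag set, no window size

# The old guard, `if interleaved_sliding_window:`, would reach the bias code
# with window=None and fail inside it (`dist > -None` raises TypeError).
# The fixed guard also demands a concrete window:
if interleaved_sliding_window and sliding_window is not None:
    bias = make_sliding_window_bias(seq_len=8, window=sliding_window)
```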
From 511fc131e2661f7ced3e739e64e7c4e473eb698a Mon Sep 17 00:00:00 2001
From: Artur Fierka
Date: Mon, 26 Jan 2026 08:13:14 +0100
Subject: [PATCH 3/4] Fix Llama4 shape mismatch for 32k+ context window (#842)
 (#855)

For `max_model_len > 32k`, Llama4 enables temperature adjustment:
https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama4.py#L719

With the adjustment enabled, the `q` tensor changes from 2D to 3D:
https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama4.py#L307

This tensor is passed to `UnquantizedFusedMoEMethod.forward`:
https://github.com/vllm-project/vllm-gaudi/blob/main/vllm_gaudi/ops/hpu_fused_moe.py#L163

where the final reshape then fails: we try to return a 3D `output.view` of a
2D output tensor (reproduced in the sketch after PATCH 4/4).

The bug was introduced by PRs #680 and #684.

Cherry-picked from `releases/v0.13.0`.

---------

Signed-off-by: Artur Fierka
---
 vllm_gaudi/ops/hpu_fused_moe.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm_gaudi/ops/hpu_fused_moe.py b/vllm_gaudi/ops/hpu_fused_moe.py
index a27710fa74..4168d515ee 100644
--- a/vllm_gaudi/ops/hpu_fused_moe.py
+++ b/vllm_gaudi/ops/hpu_fused_moe.py
@@ -160,7 +160,10 @@ def forward_oot(
             permuted_weights=True,
             activation=layer.activation,
         )
-        return output.view(*(output.size(0), *input_shape[1:]))
+        if layer.dp_size > 1:
+            return output.view(*(output.size(0), *input_shape[1:]))
+        else:
+            return output.view(*input_shape)
 
     def reduce_output(self, states: torch.Tensor) -> torch.Tensor:

From 70d8e72b5ccadec08a970edd76649c2a77f16878 Mon Sep 17 00:00:00 2001
From: Radoslaw Smyrek
Date: Thu, 22 Jan 2026 13:18:52 +0200
Subject: [PATCH 4/4] Flatten positions only when QK norm is enabled

Signed-off-by: Radoslaw Smyrek
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index a4eae8c0a1..aaa6b9f4f7 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -370,7 +370,8 @@ def patch_llama4_get_attn_scale(model):
         orig = attn._get_attn_scale
 
         def _get_attn_scale_for_hpu(self, positions, _orig=orig):
-            positions = positions.flatten()
+            if self.qk_norm is not None:
+                positions = positions.flatten()
             return _orig(positions)
 
         attn._get_attn_scale = types.MethodType(_get_attn_scale_for_hpu, attn)
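The failure mode fixed in PATCH 3/4 can be reproduced in a few lines of plain
PyTorch. The sketch below uses illustrative sizes, and a simple flatten stands
in for the HPU fused-MoE kernel, which consumes and produces 2D tensors:

```python
import torch

hidden = 16
x = torch.randn(2, 4, hidden)   # 3D input, as produced by Llama4 temperature
input_shape = x.shape           # adjustment when max_model_len > 32k
output = x.view(-1, hidden)     # stand-in for the MoE kernel: 2D (8, 16) output

# Old return: output.view(output.size(0), *input_shape[1:]) asks for a
# (8, 4, 16) = 512-element view of a 128-element tensor -> RuntimeError.
try:
    output.view(*(output.size(0), *input_shape[1:]))
except RuntimeError as err:
    print(f"old reshape fails: {err}")

# Fixed return for dp_size == 1: restore the caller's original shape.
assert output.view(*input_shape).shape == x.shape  # (2, 4, 16)
```

For 2D inputs both forms agree, since `output.size(0) == input_shape[0]`. The
`dp_size > 1` branch keeps the old form, presumably because data-parallel
gathering can change the leading dimension relative to the local input shape.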