From 2c837d206bbe57bc92a33001a67913dd45b2e17e Mon Sep 17 00:00:00 2001
From: Jan Kaniecki <jkaniecki@habana.ai>
Date: Fri, 30 Jan 2026 17:44:11 +0200
Subject: [PATCH 1/3] Revert part of #758 to fix Llama4 Maverick

Signed-off-by: Jan Kaniecki <jan.kaniecki@intel.com>
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index ac50eb7712..9b03fa4420 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -367,29 +367,6 @@ def is_mm_optimized(model):
         'Gemma3ForConditionalGeneration' in str(type(model))
 
 
-def patch_llama4_get_attn_scale(model):
-
-    config = getattr(model, "config", None)
-    is_llama4 = (getattr(config, "model_type", None) == "llama4") or ("llama4" in type(model).__name__.lower())
-    if not is_llama4:
-        return
-
-    for layer in model.language_model.model.layers:
-
-        if "Llama4Attention" not in type(layer.self_attn).__name__:
-            continue
-
-        attn = layer.self_attn
-        orig = attn._get_attn_scale
-
-        def _get_attn_scale_for_hpu(self, positions, _orig=orig):
-            if self.qk_norm is not None:
-                positions = positions.flatten()
-            return _orig(positions)
-
-        attn._get_attn_scale = types.MethodType(_get_attn_scale_for_hpu, attn)
-
-
 def maybe_set_chunked_attention_layers(model_runner):
     if hasattr(model_runner.model.config, 'text_config') and \
         hasattr(model_runner.model.config.text_config, 'attention_chunk_size') and \
@@ -429,7 +406,6 @@ def maybe_set_mamba_kv_cache_groups_ids(model, kv_cache_config: KVCacheConfig):
 def apply_model_specific_patches(model_runner):
     """The function applies model-specific monkey patches."""
     maybe_set_chunked_attention_layers(model_runner)
-    patch_llama4_get_attn_scale(model_runner.model)
 
 
 class HpuModelAdapter(torch.nn.Module, KVConnectorModelRunnerMixin):

From c4943dd709c993842825812c893ed553283a0930 Mon Sep 17 00:00:00 2001
From: Jan Kaniecki <jan.kaniecki@intel.com>
Date: Mon, 2 Feb 2026 08:25:41 +0100
Subject: [PATCH 2/3] Update hpu_model_runner.py: re-add Llama4 attn-scale patch with the scale computed inline

---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 25 ++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 9b03fa4420..02228afcfc 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -381,6 +381,30 @@ def maybe_set_chunked_attention_layers(model_runner):
             pass
 
 
+def patch_llama4_get_attn_scale(model):
+
+    config = getattr(model, "config", None)
+    is_llama4 = (getattr(config, "model_type", None) == "llama4") or ("llama4" in type(model).__name__.lower())
+    if not is_llama4:
+        return
+
+    for layer in model.language_model.model.layers:
+
+        if "Llama4Attention" not in type(layer.self_attn).__name__:
+            continue
+
+        attn = layer.self_attn
+
+        def _get_attn_scale_for_hpu(self, positions):
+            if self.qk_norm is not None:
+                positions = positions.flatten()
+            floor = torch.floor((positions + 1.0) / self.floor_scale)
+            attn_scale = torch.log(floor + 1.0) * self.attn_scale + 1.0
+
+            return attn_scale.unsqueeze(-1)
+
+        attn._get_attn_scale = types.MethodType(_get_attn_scale_for_hpu, attn)
+
 def maybe_set_mamba_kv_cache_groups_ids(model, kv_cache_config: KVCacheConfig):
     if isinstance(model, HpuModelAdapter):
         model = model.model
@@ -406,6 +430,7 @@ def maybe_set_mamba_kv_cache_groups_ids(model, kv_cache_config: KVCacheConfig):
 def apply_model_specific_patches(model_runner):
     """The function applies model-specific monkey patches."""
     maybe_set_chunked_attention_layers(model_runner)
+    patch_llama4_get_attn_scale(model_runner.model)
 
 
 class HpuModelAdapter(torch.nn.Module, KVConnectorModelRunnerMixin):

From 634e1f2090c28675312650a735f13cb8d0c072c9 Mon Sep 17 00:00:00 2001
From: Jan Kaniecki <jan.kaniecki@intel.com>
Date: Mon, 2 Feb 2026 08:26:57 +0100
Subject: [PATCH 3/3] Update hpu_model_runner.py: move maybe_set_chunked_attention_layers below patch_llama4_get_attn_scale

---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 29 ++++++++++++------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 02228afcfc..d9bb1d2e51 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -367,20 +367,6 @@ def is_mm_optimized(model):
         'Gemma3ForConditionalGeneration' in str(type(model))
 
 
-def maybe_set_chunked_attention_layers(model_runner):
-    if hasattr(model_runner.model.config, 'text_config') and \
-        hasattr(model_runner.model.config.text_config, 'attention_chunk_size') and \
-        model_runner.model.config.text_config.attention_chunk_size:
-        model_runner.model_has_chunked_attention = True
-        try:
-            for layer in model_runner.model.language_model.model.layers:
-                if "ChunkedLocalAttention" in layer.self_attn.attn.get_attn_backend().__name__:
-                    layer.self_attn.attn.impl.is_chunked_attention = True
-        except Exception:
-            # add explicit warning
-            pass
-
-
 def patch_llama4_get_attn_scale(model):
 
     config = getattr(model, "config", None)
@@ -405,6 +391,21 @@ def _get_attn_scale_for_hpu(self, positions):
 
         attn._get_attn_scale = types.MethodType(_get_attn_scale_for_hpu, attn)
 
+
+def maybe_set_chunked_attention_layers(model_runner):
+    if hasattr(model_runner.model.config, 'text_config') and \
+        hasattr(model_runner.model.config.text_config, 'attention_chunk_size') and \
+        model_runner.model.config.text_config.attention_chunk_size:
+        model_runner.model_has_chunked_attention = True
+        try:
+            for layer in model_runner.model.language_model.model.layers:
+                if "ChunkedLocalAttention" in layer.self_attn.attn.get_attn_backend().__name__:
+                    layer.self_attn.attn.impl.is_chunked_attention = True
+        except Exception:
+            # add explicit warning
+            pass
+
+
 def maybe_set_mamba_kv_cache_groups_ids(model, kv_cache_config: KVCacheConfig):
     if isinstance(model, HpuModelAdapter):
         model = model.model