From 2c837d206bbe57bc92a33001a67913dd45b2e17e Mon Sep 17 00:00:00 2001 From: Jan Kaniecki Date: Fri, 30 Jan 2026 17:44:11 +0200 Subject: [PATCH 1/3] Revert part of #758 to fix Llama4 Maverick Signed-off-by: Jan Kaniecki --- vllm_gaudi/v1/worker/hpu_model_runner.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index ac50eb7712..9b03fa4420 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -367,29 +367,6 @@ def is_mm_optimized(model): 'Gemma3ForConditionalGeneration' in str(type(model)) -def patch_llama4_get_attn_scale(model): - - config = getattr(model, "config", None) - is_llama4 = (getattr(config, "model_type", None) == "llama4") or ("llama4" in type(model).__name__.lower()) - if not is_llama4: - return - - for layer in model.language_model.model.layers: - - if "Llama4Attention" not in type(layer.self_attn).__name__: - continue - - attn = layer.self_attn - orig = attn._get_attn_scale - - def _get_attn_scale_for_hpu(self, positions, _orig=orig): - if self.qk_norm is not None: - positions = positions.flatten() - return _orig(positions) - - attn._get_attn_scale = types.MethodType(_get_attn_scale_for_hpu, attn) - - def maybe_set_chunked_attention_layers(model_runner): if hasattr(model_runner.model.config, 'text_config') and \ hasattr(model_runner.model.config.text_config, 'attention_chunk_size') and \ @@ -429,7 +406,6 @@ def maybe_set_mamba_kv_cache_groups_ids(model, kv_cache_config: KVCacheConfig): def apply_model_specific_patches(model_runner): """The function applies model-specific monkey patches.""" maybe_set_chunked_attention_layers(model_runner) - patch_llama4_get_attn_scale(model_runner.model) class HpuModelAdapter(torch.nn.Module, KVConnectorModelRunnerMixin): From c4943dd709c993842825812c893ed553283a0930 Mon Sep 17 00:00:00 2001 From: Jan Kaniecki Date: Mon, 2 Feb 2026 08:25:41 +0100 Subject: [PATCH 2/3] Update hpu_model_runner.py --- vllm_gaudi/v1/worker/hpu_model_runner.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 9b03fa4420..02228afcfc 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -381,6 +381,30 @@ def maybe_set_chunked_attention_layers(model_runner): pass +def patch_llama4_get_attn_scale(model): + + config = getattr(model, "config", None) + is_llama4 = (getattr(config, "model_type", None) == "llama4") or ("llama4" in type(model).__name__.lower()) + if not is_llama4: + return + + for layer in model.language_model.model.layers: + + if "Llama4Attention" not in type(layer.self_attn).__name__: + continue + + attn = layer.self_attn + + def _get_attn_scale_for_hpu(self, positions): + if self.qk_norm is not None: + positions = positions.flatten() + floor = torch.floor((positions + 1.0) / self.floor_scale) + attn_scale = torch.log(floor + 1.0) * self.attn_scale + 1.0 + + return attn_scale.unsqueeze(-1) + + attn._get_attn_scale = types.MethodType(_get_attn_scale_for_hpu, attn) + def maybe_set_mamba_kv_cache_groups_ids(model, kv_cache_config: KVCacheConfig): if isinstance(model, HpuModelAdapter): model = model.model @@ -406,6 +430,7 @@ def maybe_set_mamba_kv_cache_groups_ids(model, kv_cache_config: KVCacheConfig): def apply_model_specific_patches(model_runner): """The function applies model-specific monkey patches.""" maybe_set_chunked_attention_layers(model_runner) + patch_llama4_get_attn_scale(model_runner.model) class HpuModelAdapter(torch.nn.Module, KVConnectorModelRunnerMixin): From 634e1f2090c28675312650a735f13cb8d0c072c9 Mon Sep 17 00:00:00 2001 From: Jan Kaniecki Date: Mon, 2 Feb 2026 08:26:57 +0100 Subject: [PATCH 3/3] Update hpu_model_runner.py --- vllm_gaudi/v1/worker/hpu_model_runner.py | 29 ++++++++++++------------ 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 02228afcfc..d9bb1d2e51 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -367,20 +367,6 @@ def is_mm_optimized(model): 'Gemma3ForConditionalGeneration' in str(type(model)) -def maybe_set_chunked_attention_layers(model_runner): - if hasattr(model_runner.model.config, 'text_config') and \ - hasattr(model_runner.model.config.text_config, 'attention_chunk_size') and \ - model_runner.model.config.text_config.attention_chunk_size: - model_runner.model_has_chunked_attention = True - try: - for layer in model_runner.model.language_model.model.layers: - if "ChunkedLocalAttention" in layer.self_attn.attn.get_attn_backend().__name__: - layer.self_attn.attn.impl.is_chunked_attention = True - except Exception: - # add explicit warning - pass - - def patch_llama4_get_attn_scale(model): config = getattr(model, "config", None) @@ -405,6 +391,21 @@ def _get_attn_scale_for_hpu(self, positions): attn._get_attn_scale = types.MethodType(_get_attn_scale_for_hpu, attn) + +def maybe_set_chunked_attention_layers(model_runner): + if hasattr(model_runner.model.config, 'text_config') and \ + hasattr(model_runner.model.config.text_config, 'attention_chunk_size') and \ + model_runner.model.config.text_config.attention_chunk_size: + model_runner.model_has_chunked_attention = True + try: + for layer in model_runner.model.language_model.model.layers: + if "ChunkedLocalAttention" in layer.self_attn.attn.get_attn_backend().__name__: + layer.self_attn.attn.impl.is_chunked_attention = True + except Exception: + # add explicit warning + pass + + def maybe_set_mamba_kv_cache_groups_ids(model, kv_cache_config: KVCacheConfig): if isinstance(model, HpuModelAdapter): model = model.model