From 3f9d306441279439d3ba946e443cc542abd7bc69 Mon Sep 17 00:00:00 2001 From: Karol Brejna Date: Thu, 11 Sep 2025 15:42:04 +0200 Subject: [PATCH 1/2] Remove dead code (caused by use_new_cache = False on HPU) --- .../models/deepseek_v2/modeling_deepseek_v2.py | 10 +++------- .../habana/transformers/models/gemma/modeling_gemma.py | 9 ++------- .../transformers/models/gemma2/modeling_gemma2.py | 8 ++------ .../habana/transformers/models/llama/modeling_llama.py | 8 ++------ .../transformers/models/mixtral/modeling_mixtral.py | 8 ++------ .../habana/transformers/models/qwen2/modeling_qwen2.py | 8 ++------ .../models/qwen2_moe/modeling_qwen2_moe.py | 10 +++------- .../habana/transformers/models/qwen3/modeling_qwen3.py | 8 ++------ .../models/qwen3_moe/modeling_qwen3_moe.py | 10 +++------- 9 files changed, 21 insertions(+), 58 deletions(-) diff --git a/optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py b/optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py index 4eeeeda2fc..57718b675f 100644 --- a/optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py +++ b/optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py @@ -1794,13 +1794,9 @@ def forward( else: past_seen_tokens = past_key_values[0][0][2] else: - if use_new_cache: - if not isinstance(past_key_values, StaticCache): - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_seen_tokens = past_key_values.get_seq_length() - else: - if past_key_values[0] is not None: ##added for (None, None) - past_seen_tokens = past_key_values[0][0].shape[2] + # HPU uses legacy cache path (use_new_cache = False) + if past_key_values[0] is not None: ##added for (None, None) + past_seen_tokens = past_key_values[0][0].shape[2] if ignore_cache_position is False: if cache_position is None: diff --git a/optimum/habana/transformers/models/gemma/modeling_gemma.py b/optimum/habana/transformers/models/gemma/modeling_gemma.py index ffc6941b9a..e9be7c56ea 100755 --- 
a/optimum/habana/transformers/models/gemma/modeling_gemma.py +++ b/optimum/habana/transformers/models/gemma/modeling_gemma.py @@ -623,13 +623,8 @@ def forward( if reuse_cache: past_seen_tokens = past_key_values[0][0][2] else: - if use_new_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_seen_tokens = past_key_values.get_usable_length(seq_length) - else: - past_seen_tokens = past_key_values[0][0].shape[2] + # HPU uses legacy cache path (use_new_cache = False) + past_seen_tokens = past_key_values[0][0].shape[2] cache_position = None diff --git a/optimum/habana/transformers/models/gemma2/modeling_gemma2.py b/optimum/habana/transformers/models/gemma2/modeling_gemma2.py index 883feccf0e..0195093bc5 100755 --- a/optimum/habana/transformers/models/gemma2/modeling_gemma2.py +++ b/optimum/habana/transformers/models/gemma2/modeling_gemma2.py @@ -600,12 +600,8 @@ def forward( else: past_seen_tokens = past_key_values[0][0][2] else: - if use_new_cache: - if not isinstance(past_key_values, StaticCache): - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_seen_tokens = past_key_values.get_seq_length() - else: - past_seen_tokens = past_key_values[0][0].shape[2] + # HPU uses legacy cache path (use_new_cache = False) + past_seen_tokens = past_key_values[0][0].shape[2] if ignore_cache_position is False: if cache_position is None: diff --git a/optimum/habana/transformers/models/llama/modeling_llama.py b/optimum/habana/transformers/models/llama/modeling_llama.py index c31460f74c..3b2e1a99be 100755 --- a/optimum/habana/transformers/models/llama/modeling_llama.py +++ b/optimum/habana/transformers/models/llama/modeling_llama.py @@ -1247,12 +1247,8 @@ def forward( else: past_seen_tokens = past_key_values[0][0][2] else: - if use_new_cache: - if not isinstance(past_key_values, StaticCache): - past_key_values = 
DynamicCache.from_legacy_cache(past_key_values) - past_seen_tokens = past_key_values.get_seq_length() - else: - past_seen_tokens = past_key_values[0][0].shape[2] + # HPU uses legacy cache path (use_new_cache = False) + past_seen_tokens = past_key_values[0][0].shape[2] if ignore_cache_position is False: if cache_position is None: diff --git a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py index c05dcde03d..bc5c995649 100644 --- a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py +++ b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py @@ -591,12 +591,8 @@ def forward( if reuse_cache: past_key_values_length = past_key_values[0][0][2] else: - if use_new_cache: - if not isinstance(past_key_values, StaticCache): - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length() - else: - past_key_values_length = past_key_values[0][0].shape[2] + # HPU uses legacy cache path (use_new_cache = False) + past_key_values_length = past_key_values[0][0].shape[2] if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) diff --git a/optimum/habana/transformers/models/qwen2/modeling_qwen2.py b/optimum/habana/transformers/models/qwen2/modeling_qwen2.py index 5057b14665..f96b571f31 100644 --- a/optimum/habana/transformers/models/qwen2/modeling_qwen2.py +++ b/optimum/habana/transformers/models/qwen2/modeling_qwen2.py @@ -809,12 +809,8 @@ def forward( else: past_seen_tokens = past_key_values[0][0][2] else: - if use_new_cache: - if not isinstance(past_key_values, StaticCache): - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_seen_tokens = past_key_values.get_seq_length() - else: - past_seen_tokens = past_key_values[0][0].shape[2] + # HPU uses legacy cache path (use_new_cache = False) + past_seen_tokens = past_key_values[0][0].shape[2] if ignore_cache_position is False: if 
cache_position is None: diff --git a/optimum/habana/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/optimum/habana/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 5b477853af..148a8e9c0c 100755 --- a/optimum/habana/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/optimum/habana/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -869,13 +869,9 @@ def forward( else: past_seen_tokens = past_key_values[0][0][2] else: - if use_new_cache: - if not isinstance(past_key_values, StaticCache): - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_seen_tokens = past_key_values.get_seq_length() - else: - if past_key_values[0] is not None: ##added for (None, None) - past_seen_tokens = past_key_values[0][0].shape[2] + # HPU uses legacy cache path (use_new_cache = False) + if past_key_values[0] is not None: ##added for (None, None) + past_seen_tokens = past_key_values[0][0].shape[2] if ignore_cache_position is False: if cache_position is None: diff --git a/optimum/habana/transformers/models/qwen3/modeling_qwen3.py b/optimum/habana/transformers/models/qwen3/modeling_qwen3.py index f4b56e8f78..6364e764be 100644 --- a/optimum/habana/transformers/models/qwen3/modeling_qwen3.py +++ b/optimum/habana/transformers/models/qwen3/modeling_qwen3.py @@ -804,12 +804,8 @@ def forward( else: past_seen_tokens = past_key_values[0][0][2] else: - if use_new_cache: - if not isinstance(past_key_values, StaticCache): - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_seen_tokens = past_key_values.get_seq_length() - else: - past_seen_tokens = past_key_values[0][0].shape[2] + # HPU uses legacy cache path (use_new_cache = False) + past_seen_tokens = past_key_values[0][0].shape[2] if ignore_cache_position is False: if cache_position is None: diff --git a/optimum/habana/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/optimum/habana/transformers/models/qwen3_moe/modeling_qwen3_moe.py index e0e24e2a3e..8c87c7d020 100644 --- 
a/optimum/habana/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/optimum/habana/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -906,13 +906,9 @@ def forward( else: past_seen_tokens = past_key_values[0][0][2] else: - if use_new_cache: - if not isinstance(past_key_values, StaticCache): - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_seen_tokens = past_key_values.get_seq_length() - else: - if past_key_values[0] is not None: ##added for (None, None) - past_seen_tokens = past_key_values[0][0].shape[2] + # HPU uses legacy cache path (use_new_cache = False) + if past_key_values[0] is not None: ##added for (None, None) + past_seen_tokens = past_key_values[0][0].shape[2] if ignore_cache_position is False: if cache_position is None: From 93e6a7f5580c5748cb963403204060f7be228d84 Mon Sep 17 00:00:00 2001 From: Karol Brejna Date: Thu, 11 Sep 2025 15:54:13 +0200 Subject: [PATCH 2/2] Fix formatting --- .../transformers/models/deepseek_v2/modeling_deepseek_v2.py | 2 +- optimum/habana/transformers/models/gemma/modeling_gemma.py | 2 +- optimum/habana/transformers/models/gemma2/modeling_gemma2.py | 2 +- optimum/habana/transformers/models/llama/modeling_llama.py | 2 +- optimum/habana/transformers/models/mixtral/modeling_mixtral.py | 2 +- optimum/habana/transformers/models/qwen2/modeling_qwen2.py | 2 +- .../habana/transformers/models/qwen2_moe/modeling_qwen2_moe.py | 2 +- optimum/habana/transformers/models/qwen3/modeling_qwen3.py | 2 +- .../habana/transformers/models/qwen3_moe/modeling_qwen3_moe.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py b/optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py index 57718b675f..fd7c84fc82 100644 --- a/optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py +++ b/optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py @@ -36,7 +36,7 @@ from torch.nn import 
BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers import PretrainedConfig from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.cache_utils import Cache, StaticCache from transformers.generation import GenerationMixin from transformers.integrations.deepspeed import is_deepspeed_available from transformers.modeling_attn_mask_utils import ( diff --git a/optimum/habana/transformers/models/gemma/modeling_gemma.py b/optimum/habana/transformers/models/gemma/modeling_gemma.py index e9be7c56ea..b853090c92 100755 --- a/optimum/habana/transformers/models/gemma/modeling_gemma.py +++ b/optimum/habana/transformers/models/gemma/modeling_gemma.py @@ -25,7 +25,7 @@ import torch import torch.nn.functional as F -from transformers.cache_utils import Cache, DynamicCache +from transformers.cache_utils import Cache from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from transformers.models.gemma.modeling_gemma import ( GemmaAttention, diff --git a/optimum/habana/transformers/models/gemma2/modeling_gemma2.py b/optimum/habana/transformers/models/gemma2/modeling_gemma2.py index 0195093bc5..0088fa6eb7 100755 --- a/optimum/habana/transformers/models/gemma2/modeling_gemma2.py +++ b/optimum/habana/transformers/models/gemma2/modeling_gemma2.py @@ -20,7 +20,7 @@ import torch import torch.nn.functional as F -from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.cache_utils import Cache from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from transformers.models.gemma2.modeling_gemma2 import ( Gemma2Attention, diff --git a/optimum/habana/transformers/models/llama/modeling_llama.py b/optimum/habana/transformers/models/llama/modeling_llama.py index 3b2e1a99be..02efc16904 100755 --- a/optimum/habana/transformers/models/llama/modeling_llama.py +++ 
b/optimum/habana/transformers/models/llama/modeling_llama.py @@ -4,7 +4,7 @@ import torch from torch.distributed.distributed_c10d import ProcessGroup from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.cache_utils import Cache from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS from transformers.models.llama.modeling_llama import ( diff --git a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py index bc5c995649..863731a551 100644 --- a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py +++ b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py @@ -28,7 +28,7 @@ import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F -from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.cache_utils import Cache from transformers.integrations.deepspeed import is_deepspeed_available from transformers.modeling_attn_mask_utils import ( _prepare_4d_causal_attention_mask, diff --git a/optimum/habana/transformers/models/qwen2/modeling_qwen2.py b/optimum/habana/transformers/models/qwen2/modeling_qwen2.py index f96b571f31..3d9fc24b2c 100644 --- a/optimum/habana/transformers/models/qwen2/modeling_qwen2.py +++ b/optimum/habana/transformers/models/qwen2/modeling_qwen2.py @@ -19,7 +19,7 @@ from typing import Optional, Union import torch -from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.cache_utils import Cache from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask from transformers.modeling_outputs import ( BaseModelOutputWithPast, diff --git a/optimum/habana/transformers/models/qwen2_moe/modeling_qwen2_moe.py 
b/optimum/habana/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 148a8e9c0c..8dd2653a66 100755 --- a/optimum/habana/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/optimum/habana/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -26,7 +26,7 @@ import habana_frameworks.torch.core as htcore import torch import torch.nn.functional as F -from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.cache_utils import Cache, StaticCache from transformers.integrations.deepspeed import is_deepspeed_available from transformers.modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig diff --git a/optimum/habana/transformers/models/qwen3/modeling_qwen3.py b/optimum/habana/transformers/models/qwen3/modeling_qwen3.py index 6364e764be..f6932c8c54 100644 --- a/optimum/habana/transformers/models/qwen3/modeling_qwen3.py +++ b/optimum/habana/transformers/models/qwen3/modeling_qwen3.py @@ -19,7 +19,7 @@ from typing import Optional, Union import torch -from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.cache_utils import Cache from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask from transformers.modeling_outputs import ( BaseModelOutputWithPast, diff --git a/optimum/habana/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/optimum/habana/transformers/models/qwen3_moe/modeling_qwen3_moe.py index 8c87c7d020..45d55503f1 100644 --- a/optimum/habana/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/optimum/habana/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -25,7 +25,7 @@ import torch import torch.nn.functional as F from torch import nn -from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.cache_utils import Cache, StaticCache from transformers.integrations.deepspeed import is_deepspeed_available from 
transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask from transformers.modeling_outputs import (