From f3d6072202269b81d3f2317ba08af2da3cc3262d Mon Sep 17 00:00:00 2001
From: Cyril Vallez
Date: Thu, 12 Mar 2026 15:00:20 +0100
Subject: [PATCH 1/2] fix

---
 src/transformers/models/lfm2/modeling_lfm2.py         | 10 +++++++++-
 src/transformers/models/lfm2/modular_lfm2.py          | 10 +++++++++-
 src/transformers/models/lfm2_moe/modeling_lfm2_moe.py | 10 +++++++++-
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/lfm2/modeling_lfm2.py b/src/transformers/models/lfm2/modeling_lfm2.py
index f9ae8deeb865..a7d7f3595d6b 100644
--- a/src/transformers/models/lfm2/modeling_lfm2.py
+++ b/src/transformers/models/lfm2/modeling_lfm2.py
@@ -459,14 +459,22 @@ def cuda_kernels_forward(
         past_key_values: Lfm2HybridConvCache | None = None,
         attention_mask: torch.Tensor | None = None,
     ):
+        seqlen = x.shape[1]
         x = apply_mask_to_padding_states(x, attention_mask)
         BCx = self.in_proj(x).transpose(-1, -2)
 
         B, C, x = BCx.chunk(3, dim=-2)
         Bx = B * x
 
+        # Note: we may or may not have to subtract the current seq_len here as the cache may or may not be already updated
+        # by the current layer
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        # In this case, the cache was already updated and we need to subtract seq_len to get the correct past length
+        if "full_attention" in self.config.layer_types[: self.layer_idx]:
+            past_seen_tokens = past_seen_tokens - seqlen
+
         conv_weights = self.conv.weight.view(self.conv.weight.size(0), self.conv.weight.size(2))
-        if past_key_values is not None and past_key_values.get_seq_length() > 0:
+        if past_key_values is not None and past_seen_tokens > 0:
             conv_out = causal_conv1d_update(
                 Bx.squeeze(-1),
                 past_key_values.conv_cache[self.layer_idx],
diff --git a/src/transformers/models/lfm2/modular_lfm2.py b/src/transformers/models/lfm2/modular_lfm2.py
index eab687df803a..f25004fda454 100644
--- a/src/transformers/models/lfm2/modular_lfm2.py
+++ b/src/transformers/models/lfm2/modular_lfm2.py
@@ -294,14 +294,22 @@ def cuda_kernels_forward(
         past_key_values: Lfm2HybridConvCache | None = None,
         attention_mask: torch.Tensor | None = None,
     ):
+        seqlen = x.shape[1]
         x = apply_mask_to_padding_states(x, attention_mask)
         BCx = self.in_proj(x).transpose(-1, -2)
 
         B, C, x = BCx.chunk(3, dim=-2)
         Bx = B * x
 
+        # Note: we may or may not have to subtract the current seq_len here as the cache may or may not be already updated
+        # by the current layer
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        # In this case, the cache was already updated and we need to subtract seq_len to get the correct past length
+        if "full_attention" in self.config.layer_types[: self.layer_idx]:
+            past_seen_tokens = past_seen_tokens - seqlen
+
         conv_weights = self.conv.weight.view(self.conv.weight.size(0), self.conv.weight.size(2))
-        if past_key_values is not None and past_key_values.get_seq_length() > 0:
+        if past_key_values is not None and past_seen_tokens > 0:
             conv_out = causal_conv1d_update(
                 Bx.squeeze(-1),
                 past_key_values.conv_cache[self.layer_idx],
diff --git a/src/transformers/models/lfm2_moe/modeling_lfm2_moe.py b/src/transformers/models/lfm2_moe/modeling_lfm2_moe.py
index d6b0401e4658..7721b699dba3 100644
--- a/src/transformers/models/lfm2_moe/modeling_lfm2_moe.py
+++ b/src/transformers/models/lfm2_moe/modeling_lfm2_moe.py
@@ -535,14 +535,22 @@ def cuda_kernels_forward(
         past_key_values: Lfm2MoeHybridConvCache | None = None,
         attention_mask: torch.Tensor | None = None,
     ):
+        seqlen = x.shape[1]
         x = apply_mask_to_padding_states(x, attention_mask)
         BCx = self.in_proj(x).transpose(-1, -2)
 
         B, C, x = BCx.chunk(3, dim=-2)
         Bx = B * x
 
+        # Note: we may or may not have to subtract the current seq_len here as the cache may or may not be already updated
+        # by the current layer
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        # In this case, the cache was already updated and we need to subtract seq_len to get the correct past length
+        if "full_attention" in self.config.layer_types[: self.layer_idx]:
+            past_seen_tokens = past_seen_tokens - seqlen
+
         conv_weights = self.conv.weight.view(self.conv.weight.size(0), self.conv.weight.size(2))
-        if past_key_values is not None and past_key_values.get_seq_length() > 0:
+        if past_key_values is not None and past_seen_tokens > 0:
             conv_out = causal_conv1d_update(
                 Bx.squeeze(-1),
                 past_key_values.conv_cache[self.layer_idx],

From ec8f0cce18850e26d87bb8b1d06217e452f260b9 Mon Sep 17 00:00:00 2001
From: Cyril Vallez
Date: Thu, 12 Mar 2026 15:28:11 +0100
Subject: [PATCH 2/2] nit

---
 src/transformers/models/lfm2/modeling_lfm2.py         | 4 ++--
 src/transformers/models/lfm2/modular_lfm2.py          | 4 ++--
 src/transformers/models/lfm2_moe/modeling_lfm2_moe.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/lfm2/modeling_lfm2.py b/src/transformers/models/lfm2/modeling_lfm2.py
index a7d7f3595d6b..ef79261364ba 100644
--- a/src/transformers/models/lfm2/modeling_lfm2.py
+++ b/src/transformers/models/lfm2/modeling_lfm2.py
@@ -474,7 +474,7 @@ def cuda_kernels_forward(
             past_seen_tokens = past_seen_tokens - seqlen
 
         conv_weights = self.conv.weight.view(self.conv.weight.size(0), self.conv.weight.size(2))
-        if past_key_values is not None and past_seen_tokens > 0:
+        if past_seen_tokens > 0:
             conv_out = causal_conv1d_update(
                 Bx.squeeze(-1),
                 past_key_values.conv_cache[self.layer_idx],
@@ -515,7 +515,7 @@ def slow_forward(
         if "full_attention" in self.config.layer_types[: self.layer_idx]:
             past_seen_tokens = past_seen_tokens - seqlen
 
-        if past_key_values is not None and past_seen_tokens > 0:
+        if past_seen_tokens > 0:
             conv_state = past_key_values.conv_cache[self.layer_idx]
             cache_position = torch.arange(seqlen, device=conv_state.device) + past_seen_tokens
             cache_position = cache_position.clamp(0, self.L_cache - 1)
diff --git a/src/transformers/models/lfm2/modular_lfm2.py b/src/transformers/models/lfm2/modular_lfm2.py
index f25004fda454..65119a287abd 100644
--- a/src/transformers/models/lfm2/modular_lfm2.py
+++ b/src/transformers/models/lfm2/modular_lfm2.py
@@ -309,7 +309,7 @@ def cuda_kernels_forward(
             past_seen_tokens = past_seen_tokens - seqlen
 
         conv_weights = self.conv.weight.view(self.conv.weight.size(0), self.conv.weight.size(2))
-        if past_key_values is not None and past_seen_tokens > 0:
+        if past_seen_tokens > 0:
             conv_out = causal_conv1d_update(
                 Bx.squeeze(-1),
                 past_key_values.conv_cache[self.layer_idx],
@@ -350,7 +350,7 @@ def slow_forward(
         if "full_attention" in self.config.layer_types[: self.layer_idx]:
             past_seen_tokens = past_seen_tokens - seqlen
 
-        if past_key_values is not None and past_seen_tokens > 0:
+        if past_seen_tokens > 0:
             conv_state = past_key_values.conv_cache[self.layer_idx]
             cache_position = torch.arange(seqlen, device=conv_state.device) + past_seen_tokens
             cache_position = cache_position.clamp(0, self.L_cache - 1)
diff --git a/src/transformers/models/lfm2_moe/modeling_lfm2_moe.py b/src/transformers/models/lfm2_moe/modeling_lfm2_moe.py
index 7721b699dba3..03bdbbdc95f8 100644
--- a/src/transformers/models/lfm2_moe/modeling_lfm2_moe.py
+++ b/src/transformers/models/lfm2_moe/modeling_lfm2_moe.py
@@ -550,7 +550,7 @@ def cuda_kernels_forward(
             past_seen_tokens = past_seen_tokens - seqlen
 
         conv_weights = self.conv.weight.view(self.conv.weight.size(0), self.conv.weight.size(2))
-        if past_key_values is not None and past_seen_tokens > 0:
+        if past_seen_tokens > 0:
             conv_out = causal_conv1d_update(
                 Bx.squeeze(-1),
                 past_key_values.conv_cache[self.layer_idx],
@@ -591,7 +591,7 @@ def slow_forward(
         if "full_attention" in self.config.layer_types[: self.layer_idx]:
             past_seen_tokens = past_seen_tokens - seqlen
 
-        if past_key_values is not None and past_seen_tokens > 0:
+        if past_seen_tokens > 0:
             conv_state = past_key_values.conv_cache[self.layer_idx]
             cache_position = torch.arange(seqlen, device=conv_state.device) + past_seen_tokens
             cache_position = cache_position.clamp(0, self.L_cache - 1)
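
For context, a minimal self-contained sketch of the ordering problem the two patches above work around: whether the hybrid cache's reported length already includes the current tokens depends on whether a full_attention layer ran earlier in the stack. This is reviewer material, not part of the patch series; the toy layer_types list, the past_length_for helper, and the token counts are illustrative assumptions, not code from the repository.

    # Sketch only: assumed layer ordering, not the real LFM2 config.
    layer_types = ["conv", "full_attention", "conv"]

    def past_length_for(layer_idx: int, reported_len: int, seqlen: int) -> int:
        """Recover the number of previously seen tokens for a conv layer.

        If a full_attention layer ran before this conv layer, it has already
        appended the current `seqlen` tokens to the cache, so the reported
        length includes them and must be corrected to get the true past length.
        """
        if "full_attention" in layer_types[:layer_idx]:
            return reported_len - seqlen
        return reported_len

    # Decoding one new token (seqlen=1) after a 10-token prompt:
    # layer 0 (conv) runs before any attention layer -> cache still reports 10
    print(past_length_for(0, reported_len=10, seqlen=1))  # 10
    # layer 2 (conv) runs after the attention layer -> cache already reports 11
    print(past_length_for(2, reported_len=11, seqlen=1))  # 10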