diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
index ebbdf95e66ee..c700c3204904 100644
--- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
@@ -117,10 +117,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py
index f356dac8e988..329ef32d48c4 100644
--- a/src/transformers/models/deit/modeling_deit.py
+++ b/src/transformers/models/deit/modeling_deit.py
@@ -182,10 +182,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/dinov2/modeling_dinov2.py b/src/transformers/models/dinov2/modeling_dinov2.py
index 12381a81bee8..b0e74df05d19 100644
--- a/src/transformers/models/dinov2/modeling_dinov2.py
+++ b/src/transformers/models/dinov2/modeling_dinov2.py
@@ -170,10 +170,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
index 34cf7715ab57..65e219fcfdd4 100644
--- a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
+++ b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
@@ -190,10 +190,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/dinov3_vit/modeling_dinov3_vit.py b/src/transformers/models/dinov3_vit/modeling_dinov3_vit.py
index 1aa290e7eafb..1fa962dd807d 100644
--- a/src/transformers/models/dinov3_vit/modeling_dinov3_vit.py
+++ b/src/transformers/models/dinov3_vit/modeling_dinov3_vit.py
@@ -208,10 +208,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py
index 8bfedd2ef436..1f8bb5d86ac9 100755
--- a/src/transformers/models/dpt/modeling_dpt.py
+++ b/src/transformers/models/dpt/modeling_dpt.py
@@ -288,10 +288,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/ijepa/modeling_ijepa.py b/src/transformers/models/ijepa/modeling_ijepa.py
index 1a6f23aedd77..10c7803ce10f 100644
--- a/src/transformers/models/ijepa/modeling_ijepa.py
+++ b/src/transformers/models/ijepa/modeling_ijepa.py
@@ -161,10 +161,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py
index fab1c5b5f7e0..d0193387c589 100755
--- a/src/transformers/models/videomae/modeling_videomae.py
+++ b/src/transformers/models/videomae/modeling_videomae.py
@@ -199,10 +199,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py
index 869c67a1bb35..7c3eb386ee3b 100644
--- a/src/transformers/models/vit/modeling_vit.py
+++ b/src/transformers/models/vit/modeling_vit.py
@@ -187,10 +187,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py
index 09d9acd3a6b2..62a6f302afd3 100755
--- a/src/transformers/models/vit_mae/modeling_vit_mae.py
+++ b/src/transformers/models/vit_mae/modeling_vit_mae.py
@@ -347,10 +347,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/vit_msn/modeling_vit_msn.py b/src/transformers/models/vit_msn/modeling_vit_msn.py
index 1a8e7e00c2b5..a14e72ec6c0b 100644
--- a/src/transformers/models/vit_msn/modeling_vit_msn.py
+++ b/src/transformers/models/vit_msn/modeling_vit_msn.py
@@ -184,10 +184,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py
index a97ca8115da6..f7baf2cc8dad 100644
--- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py
+++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py
@@ -116,10 +116,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/vivit/modeling_vivit.py b/src/transformers/models/vivit/modeling_vivit.py
index 776130342a1b..8bb155e34f1a 100755
--- a/src/transformers/models/vivit/modeling_vivit.py
+++ b/src/transformers/models/vivit/modeling_vivit.py
@@ -177,10 +177,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/vjepa2/modeling_vjepa2.py b/src/transformers/models/vjepa2/modeling_vjepa2.py
index bba21bb1c3fe..268376252928 100644
--- a/src/transformers/models/vjepa2/modeling_vjepa2.py
+++ b/src/transformers/models/vjepa2/modeling_vjepa2.py
@@ -169,10 +169,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py
index fca6232afe26..55c896aefb7e 100755
--- a/src/transformers/models/yolos/modeling_yolos.py
+++ b/src/transformers/models/yolos/modeling_yolos.py
@@ -232,10 +232,6 @@ def eager_attention_forward(
     # seem a bit unusual, but is taken from the original Transformer paper.
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
-    # Mask heads if we want to
-    if attention_mask is not None:
-        attn_weights = attn_weights * attention_mask
-
     attn_output = torch.matmul(attn_weights, value)
     attn_output = attn_output.transpose(1, 2).contiguous()
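
Every hunk in this patch removes the same block from the `eager_attention_forward` helper of the respective vision model: a multiplicative masking step (commented "Mask heads if we want to") applied to the softmaxed attention weights. For orientation, the sketch below shows what the eager path computes once that step is gone. It is a minimal illustration, not the library code: the function name, the full signature, the `scaling` factor, and the return value are assumptions inferred from the hunk context rather than part of the diff.

# Minimal sketch of the eager attention path after this change.
# Only the lines visible in the hunks are taken from the patch; everything
# else (signature, scaling, return value) is an assumption for illustration.
from typing import Optional

import torch
from torch import nn


def eager_attention_forward_sketch(
    module: nn.Module,
    query: torch.Tensor,   # (batch, num_heads, seq_len, head_dim)
    key: torch.Tensor,     # (batch, num_heads, seq_len, head_dim)
    value: torch.Tensor,   # (batch, num_heads, seq_len, head_dim)
    attention_mask: Optional[torch.Tensor] = None,  # no longer applied in the eager path
    scaling: float = 1.0,
    dropout: float = 0.0,
):
    # Raw attention scores, normalized to probabilities.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    attn_weights = nn.functional.softmax(attn_weights, dim=-1)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # The removed block multiplied attn_weights by attention_mask at this point;
    # with it gone, the weights go straight into the value matmul.
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights


# Toy usage, purely to show the shapes involved:
mod = nn.Identity()  # any nn.Module works here; only `.training` is read
q = k = v = torch.randn(1, 4, 16, 32)
out, weights = eager_attention_forward_sketch(mod, q, k, v, scaling=32 ** -0.5)

Since the sketch still accepts `attention_mask`, callers keep the same signature; the parameter is simply ignored on the eager path after the removal shown above.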