diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py
index 7301f434f7fb..9f9dcd5e4cd9 100644
--- a/src/transformers/models/cohere/modeling_cohere.py
+++ b/src/transformers/models/cohere/modeling_cohere.py
@@ -46,7 +46,6 @@
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1169,13 +1168,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
         logits = logits * self.logit_scale
 
         loss = None
diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
index c6070a3d96b6..0d491b76cae7 100644
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@@ -43,7 +43,6 @@
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1081,13 +1080,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py
index ca89b6cf2a6d..f3ac0854313d 100644
--- a/src/transformers/models/gemma/modular_gemma.py
+++ b/src/transformers/models/gemma/modular_gemma.py
@@ -27,7 +27,7 @@
 from ...modeling_flash_attention_utils import _flash_attention_forward
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...pytorch_utils import ALL_LAYERNORM_LAYERS
-from ...utils import is_torchdynamo_compiling, logging
+from ...utils import logging
 from ..llama.modeling_llama import (
     LlamaDecoderLayer,
     LlamaFlashAttention2,
@@ -833,13 +828,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py
index c52b7b82e13d..6080e541763c 100644
--- a/src/transformers/models/gemma2/modeling_gemma2.py
+++ b/src/transformers/models/gemma2/modeling_gemma2.py
@@ -42,7 +42,6 @@
     add_start_docstrings_to_model_forward,
     is_flash_attn_greater_or_equal,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1058,10 +1057,6 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
         logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
         if self.config.final_logit_softcapping is not None:
@@ -1069,8 +1064,6 @@ def forward(
             logits = torch.tanh(logits)
             logits = logits * self.config.final_logit_softcapping
 
-        # TODO: remove the float() operation in v4.46
-        logits = logits.float()
         loss = None
         if labels is not None:
             # Upcast to float if we need to compute the loss to avoid potential precision issues
diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py
index ff53955716e6..f919731fc147 100644
--- a/src/transformers/models/gemma2/modular_gemma2.py
+++ b/src/transformers/models/gemma2/modular_gemma2.py
@@ -31,7 +31,6 @@
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
 )
 from ..gemma.modeling_gemma import (
@@ -800,10 +799,6 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
         logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
         if self.config.final_logit_softcapping is not None:
@@ -811,8 +806,6 @@ def forward(
             logits = torch.tanh(logits)
             logits = logits * self.config.final_logit_softcapping
 
-        # TODO: remove the float() operation in v4.46
-        logits = logits.float()
         loss = None
         if labels is not None:
             # Upcast to float if we need to compute the loss to avoid potential precision issues
diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py
index 056811138155..3af34b7dd69b 100644
--- a/src/transformers/models/idefics2/modeling_idefics2.py
+++ b/src/transformers/models/idefics2/modeling_idefics2.py
@@ -34,7 +34,6 @@
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1617,13 +1616,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py
index 07f84b362eee..54ea73ee850a 100755
--- a/src/transformers/models/jamba/modeling_jamba.py
+++ b/src/transformers/models/jamba/modeling_jamba.py
@@ -51,7 +51,6 @@
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
     is_mamba_ssm_available,
-    is_torchdynamo_compiling,
 )
 from .configuration_jamba import JambaConfig
 
@@ -1540,12 +1539,6 @@ def forward(
             logits = self.lm_head(hidden_states)
         else:
             logits = self.lm_head(hidden_states[..., -num_logits_to_keep:, :])
-        if labels is None and not is_torchdynamo_compiling:
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
-        # TODO: remove the float() operations in v4.46
-        logits = logits.float()
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py
index 8b39183b8fc6..e38993668862 100644
--- a/src/transformers/models/jetmoe/modeling_jetmoe.py
+++ b/src/transformers/models/jetmoe/modeling_jetmoe.py
@@ -38,7 +38,6 @@
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1302,13 +1301,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
index 99edee6a92a8..7a8f2c82c646 100644
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@@ -45,7 +45,6 @@
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1206,13 +1205,8 @@ def forward(
             logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
             logits = torch.cat(logits, dim=-1)
         else:
-            if labels is None and not is_torchdynamo_compiling():
-                logger.warning_once(
-                    "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-                )
             # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-            # TODO: remove the float() operation in v4.46
-            logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+            logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py
index ffa1a18307e9..a099c60c5ba9 100644
--- a/src/transformers/models/mistral/modeling_mistral.py
+++ b/src/transformers/models/mistral/modeling_mistral.py
@@ -43,7 +43,6 @@
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1050,13 +1049,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py
index e87054cd70f5..dd060a55f0fc 100644
--- a/src/transformers/models/mixtral/modeling_mixtral.py
+++ b/src/transformers/models/mixtral/modeling_mixtral.py
@@ -44,7 +44,6 @@
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1302,13 +1301,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py
index 9411f0bcae5a..c3cccd56108e 100644
--- a/src/transformers/models/nemotron/modeling_nemotron.py
+++ b/src/transformers/models/nemotron/modeling_nemotron.py
@@ -43,7 +43,6 @@
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1081,14 +1080,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
         logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
-        # TODO: remove the float() operation in v4.46
-        logits = logits.float()
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py
index 668722fc9e3f..894adeb44ef1 100644
--- a/src/transformers/models/olmo/modeling_olmo.py
+++ b/src/transformers/models/olmo/modeling_olmo.py
@@ -43,7 +43,6 @@
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1124,13 +1123,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py
index cb59bd0df9a1..1ae4a5c5c1db 100644
--- a/src/transformers/models/phi/modeling_phi.py
+++ b/src/transformers/models/phi/modeling_phi.py
@@ -43,7 +43,6 @@
     get_torch_version,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1247,13 +1246,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py
index 1c1bb34171b6..20d023311548 100644
--- a/src/transformers/models/phi3/modeling_phi3.py
+++ b/src/transformers/models/phi3/modeling_phi3.py
@@ -41,7 +41,6 @@
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1277,13 +1276,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py
index 9a970a4a1b2f..81a310bd26e7 100644
--- a/src/transformers/models/qwen2/modeling_qwen2.py
+++ b/src/transformers/models/qwen2/modeling_qwen2.py
@@ -44,7 +44,6 @@
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1180,13 +1179,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
index 2274e96245d3..7eba44c94440 100644
--- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
+++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
@@ -45,7 +45,6 @@
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1368,13 +1367,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py
index 89a36fefe77a..0f196118de56 100644
--- a/src/transformers/models/starcoder2/modeling_starcoder2.py
+++ b/src/transformers/models/starcoder2/modeling_starcoder2.py
@@ -44,7 +44,6 @@
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
-    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -1155,13 +1154,8 @@ def forward(
         )
 
         hidden_states = outputs[0]
-        if labels is None and not is_torchdynamo_compiling():
-            logger.warning_once(
-                "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
-            )
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        # TODO: remove the float() operation in v4.46
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
 
         loss = None
         if labels is not None:
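
For reference, a minimal usage sketch (not part of the patch) of the behavior these hunks converge on: when no `labels` are passed, the returned `logits` keep the model's dtype instead of being upcast to FP32 inside `forward`. The checkpoint name and half-precision dtype below are illustrative assumptions.

    # Illustrative only; checkpoint id and dtype are assumptions, not part of the patch.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "meta-llama/Llama-3.1-8B"  # hypothetical example checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

    inputs = tokenizer("Hello", return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Logits now match the model dtype (bfloat16 here); upcast manually if FP32 is needed.
    print(outputs.logits.dtype)
    probs = outputs.logits.float().softmax(dim=-1)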