diff --git a/docs/source/en/model_doc/qwen2_5_omni.md b/docs/source/en/model_doc/qwen2_5_omni.md index baa4c7b87b50..8153589e5cca 100644 --- a/docs/source/en/model_doc/qwen2_5_omni.md +++ b/docs/source/en/model_doc/qwen2_5_omni.md @@ -136,7 +136,7 @@ inputs = processor.apply_chat_template( tokenize=True, return_dict=True, return_tensors="pt", - video_fps=1, + fps=1, # kwargs to be passed to `Qwen2-5-OmniProcessor` padding=True, @@ -245,7 +245,7 @@ inputs = processor.apply_chat_template( tokenize=True, return_dict=True, return_tensors="pt", - video_fps=1, + fps=1, # kwargs to be passed to `Qwen2-5-OmniProcessor` padding=True, diff --git a/docs/source/en/model_doc/qwen2_audio.md b/docs/source/en/model_doc/qwen2_audio.md index ea52a19b39db..76e16315c3e5 100644 --- a/docs/source/en/model_doc/qwen2_audio.md +++ b/docs/source/en/model_doc/qwen2_audio.md @@ -54,7 +54,7 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B", trust_remote_co prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Generate the caption in English:" url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3" audio, sr = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate) -inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device) +inputs = processor(text=prompt, audio=audio, return_tensors="pt").to(model.device) generate_ids = model.generate(**inputs, max_length=256) generate_ids = generate_ids[:, inputs.input_ids.size(1):] @@ -63,7 +63,7 @@ response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_ # We can also omit the audio_bos and audio_eos tokens prompt = "<|AUDIO|>Generate the caption in English:" -inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device) +inputs = processor(text=prompt, audio=audio, return_tensors="pt").to(model.device) generate_ids = model.generate(**inputs, max_length=256) generate_ids = generate_ids[:, 
inputs.input_ids.size(1):] @@ -106,7 +106,7 @@ for message in conversation: sr=processor.feature_extractor.sampling_rate)[0] ) -inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True) +inputs = processor(text=text, audio=audios, return_tensors="pt", padding=True) inputs.input_ids = inputs.input_ids.to(model.device) generate_ids = model.generate(**inputs, max_length=256) @@ -156,7 +156,7 @@ for message in conversation: sr=processor.feature_extractor.sampling_rate)[0] ) -inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True) +inputs = processor(text=text, audio=audios, return_tensors="pt", padding=True) inputs.input_ids = inputs.input_ids.to(model.device) generate_ids = model.generate(**inputs, max_length=256) @@ -213,7 +213,7 @@ for conversation in conversations: sr=processor.feature_extractor.sampling_rate)[0] ) -inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True) +inputs = processor(text=text, audio=audios, return_tensors="pt", padding=True) inputs['input_ids'] = inputs['input_ids'].to(model.device) inputs.input_ids = inputs.input_ids.to(model.device) diff --git a/docs/source/en/model_doc/qwen3_omni_moe.md b/docs/source/en/model_doc/qwen3_omni_moe.md index efbea2613cf7..75dea355e4bb 100644 --- a/docs/source/en/model_doc/qwen3_omni_moe.md +++ b/docs/source/en/model_doc/qwen3_omni_moe.md @@ -80,7 +80,7 @@ inputs = processor.apply_chat_template( tokenize=True, return_dict=True, return_tensors="pt", - video_fps=1, + fps=1, # kwargs to be passed to `Qwen3OmniMoeProcessor` padding=True, @@ -136,7 +136,7 @@ inputs = processor.apply_chat_template( tokenize=True, return_dict=True, return_tensors="pt", - video_fps=1, + fps=1, # kwargs to be passed to `Qwen3OmniMoeProcessor` padding=True, @@ -245,7 +245,7 @@ inputs = processor.apply_chat_template( tokenize=True, return_dict=True, return_tensors="pt", - video_fps=1, + fps=1, # kwargs to be passed to `Qwen3OmniMoeProcessor` padding=True, diff --git 
a/docs/source/en/model_doc/seamless_m4t.md b/docs/source/en/model_doc/seamless_m4t.md index e7fc00d047c3..8415c94f8501 100644 --- a/docs/source/en/model_doc/seamless_m4t.md +++ b/docs/source/en/model_doc/seamless_m4t.md @@ -61,7 +61,7 @@ Here is how to use the processor to process text and audio: >>> audio_sample = next(iter(dataset))["audio"] >>> # now, process it ->>> audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt") +>>> audio_inputs = processor(audio=audio_sample["array"], return_tensors="pt") >>> # now, process some English test as well >>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt") diff --git a/docs/source/en/model_doc/seamless_m4t_v2.md b/docs/source/en/model_doc/seamless_m4t_v2.md index 4a32199243ab..c92e5c00527b 100644 --- a/docs/source/en/model_doc/seamless_m4t_v2.md +++ b/docs/source/en/model_doc/seamless_m4t_v2.md @@ -61,7 +61,7 @@ Here is how to use the processor to process text and audio: >>> audio_sample = next(iter(dataset))["audio"] >>> # now, process it ->>> audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt") +>>> audio_inputs = processor(audio=audio_sample["array"], return_tensors="pt") >>> # now, process some English text as well >>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 635a947bbdeb..7f5634e625fa 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -27,7 +27,6 @@ import logging import os import sys -import warnings from dataclasses import dataclass, field from random import randint from typing import Optional @@ -180,29 +179,11 @@ class ModelArguments: ) }, ) - freeze_feature_extractor: Optional[bool] = field( - default=None, metadata={"help": 
"Whether to freeze the feature extractor layers of the model."} - ) ignore_mismatched_sizes: bool = field( default=False, metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, ) - def __post_init__(self): - if not self.freeze_feature_extractor and self.freeze_feature_encoder: - warnings.warn( - "The argument `--freeze_feature_extractor` is deprecated and " - "will be removed in a future version. Use `--freeze_feature_encoder` " - "instead. Setting `freeze_feature_encoder==True`.", - FutureWarning, - ) - if self.freeze_feature_extractor and not self.freeze_feature_encoder: - raise ValueError( - "The argument `--freeze_feature_extractor` is deprecated and " - "should not be used in combination with `--freeze_feature_encoder`. " - "Only make use of `--freeze_feature_encoder`." - ) - def main(): # See all possible arguments in src/transformers/training_args.py diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index f44879e37b02..248622d3b790 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -37,7 +37,6 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, filter_out_non_signature_kwargs -from ...utils.deprecation import deprecate_kwarg from ...utils.generic import check_model_inputs from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig @@ -445,13 +444,11 @@ def __init__(self, config: Aimv2VisionConfig): def get_input_embeddings(self) -> nn.Module: return self.embeddings.patch_embed - @deprecate_kwarg("attention_mask", version="v4.58.0") @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values, - attention_mask: Optional[torch.Tensor] = None, **kwargs: Unpack[TransformersKwargs], ) -> 
BaseModelOutputWithPooling: r""" diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index a7ea96f8f2c2..d67ce3785e9b 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -32,7 +32,6 @@ auto_docstring, can_return_tuple, ) -from ...utils.deprecation import deprecate_kwarg from ...utils.generic import check_model_inputs from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm from ..llama.modeling_llama import LlamaMLP, LlamaRMSNorm @@ -488,13 +487,11 @@ def __init__(self, config: Aimv2VisionConfig): def get_input_embeddings(self) -> nn.Module: return self.embeddings.patch_embed - @deprecate_kwarg("attention_mask", version="v4.58.0") @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values, - attention_mask: Optional[torch.Tensor] = None, **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: r""" diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 933a5e48dfed..e47b5281010f 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -17,7 +17,6 @@ """ from ...processing_utils import ProcessorMixin -from ...utils.deprecation import deprecate_kwarg class AltCLIPProcessor(ProcessorMixin): @@ -35,7 +34,6 @@ class AltCLIPProcessor(ProcessorMixin): The tokenizer is a required input. 
""" - @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor") def __init__(self, image_processor=None, tokenizer=None): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index fff3158ab387..3e5ff85a2445 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -16,7 +16,6 @@ import collections.abc import math -import warnings from dataclasses import dataclass from typing import Optional, Union @@ -163,14 +162,7 @@ def forward( self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None, - interpolate_pos_encoding: Optional[bool] = None, ) -> torch.Tensor: - if self.position_embeddings is not None and interpolate_pos_encoding is not None: - warnings.warn( - "`interpolate_pos_encoding` argument has no effect for BEiTEmbeddings, embeddings are always " - "interpolated to the input image size. The argument will be removed in transformers v4.51.0." - ) - _, _, height, width = pixel_values.shape embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values) batch_size, seq_len, _ = embeddings.size() @@ -325,19 +317,9 @@ def forward( ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]: if output_attentions: logger.warning_once( - "`BeitSdpaSelfAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not " - "support `output_attentions=True`. Falling back to the manual attention implementation, " - "but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " - 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - output_attentions=output_attentions, - relative_position_bias=relative_position_bias, - interpolate_pos_encoding=interpolate_pos_encoding, - resolution=resolution, + f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention weights will " + "be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model." ) - batch_size, seq_length, _ = hidden_states.shape query_layer = ( self.query(hidden_states) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 806b08469f6f..aded7f6bf64d 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -15,7 +15,6 @@ """PyTorch BLIP-2 model.""" import math -import warnings from collections.abc import Callable from dataclasses import dataclass from typing import Any, Optional, Union @@ -1090,7 +1089,6 @@ def get_text_features( decoder_input_ids: Optional[torch.Tensor] = None, decoder_attention_mask: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, - legacy_output: bool = True, ) -> Union[torch.FloatTensor, CausalLMOutputWithPast]: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): @@ -1109,12 +1107,10 @@ def get_text_features( decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default. - legacy_output (`bool`, *optional*, defaults to `True`): - Whether to return a model output object or a tensor of features. Returns: - text_outputs (`CausalLMOutputWithPast` or `torch.FloatTensor`): - The language model outputs. If `legacy_output=False`, the output is a `torch.FloatTensor`. 
+ text_outputs (`torch.FloatTensor`): + The language model's logits. Examples: ```python @@ -1129,13 +1125,6 @@ def get_text_features( ... text_features = model.get_text_features(**inputs) ```""" - if legacy_output: - warnings.warn( - "Deprecation notice: In Transformers v4.59, the default return value of `get_text_features` will change. " - "Currently, this method returns a model output object, but starting in v4.59, it will return a tensor instead. " - "To opt in to the new behavior now, set `legacy_output=False`." - ) - if self.config.use_decoder_only_language_model: text_outputs: CausalLMOutputWithPast = self.language_model( input_ids=input_ids, @@ -1153,7 +1142,7 @@ def get_text_features( return_dict=True, ) - return text_outputs if legacy_output else text_outputs.logits + return text_outputs.logits @filter_out_non_signature_kwargs() @auto_docstring @@ -1161,15 +1150,11 @@ def get_image_features( self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, - legacy_output: bool = True, ) -> Union[torch.FloatTensor, CausalLMOutputWithPast]: r""" - legacy_output (`bool`, *optional*, defaults to `True`): - Whether to return a model output object or a tensor of features. - Returns: - vision_outputs (`BaseModelOutputWithPooling` or `torch.FloatTensor`): - The vision model outputs. If `legacy_output=False`, the output is a `torch.FloatTensor`. + vision_outputs (`torch.FloatTensor`): + The vision model's last layer pooled output. Examples: ```python @@ -1187,20 +1172,13 @@ def get_image_features( >>> with torch.inference_mode(): ... image_outputs = model.get_image_features(**inputs) ```""" - if legacy_output: - warnings.warn( - "Deprecation notice: In Transformers v4.59, the default return value of `get_text_features` will change. " - "Currently, this method returns a model output object, but starting in v4.59, it will return a tensor instead. " - "To opt in to the new behavior now, set `legacy_output=False`." 
- ) - vision_outputs = self.vision_model( pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, return_dict=True, ) - return vision_outputs if legacy_output else vision_outputs.pooler_output + return vision_outputs.pooler_output @filter_out_non_signature_kwargs() @auto_docstring @@ -1208,15 +1186,11 @@ def get_qformer_features( self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, - legacy_output: bool = True, ) -> Union[torch.FloatTensor, BaseModelOutputWithPooling]: r""" - legacy_output (`bool`, *optional*, defaults to `True`): - Whether to return a model output object or a tensor of features. - Returns: - qformer_outputs (`BaseModelOutputWithPooling` or `torch.FloatTensor`): - The Q-Former outputs. If `legacy_output=False`, the output is a `torch.FloatTensor`. + qformer_outputs (`torch.FloatTensor`): + The Q-Former model's last layer hidden states. Examples: @@ -1235,14 +1209,6 @@ def get_qformer_features( >>> with torch.inference_mode(): ... qformer_outputs = model.get_qformer_features(**inputs) ```""" - - if legacy_output: - warnings.warn( - "Deprecation notice: In Transformers v4.59, the default return value of `get_qformer_features` will change. " - "Currently, this method returns a model output object, but starting in v4.59, it will return a tensor instead. " - "To opt in to the new behavior now, set `legacy_output=False`." 
- ) - vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, @@ -1262,7 +1228,7 @@ def get_qformer_features( return_dict=True, ) - return query_outputs if legacy_output else query_outputs.last_hidden_state + return query_outputs.last_hidden_state def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor): """ diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index af63b5ef66f2..2c7703811e6c 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -15,7 +15,6 @@ """PyTorch BLOOM model.""" import math -import warnings from typing import Optional, Union import torch @@ -484,7 +483,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, - **deprecated_arguments, ) -> Union[tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: r""" input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): @@ -499,16 +497,6 @@ def forward( [What are input IDs?](../glossary#input-ids) """ - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" - " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -817,7 +805,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, - **deprecated_arguments, + **kwargs, ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): @@ -836,18 +824,6 @@ def forward( `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - # Bloom has deprecated kwargs, so we need to pop num_items_in_batch explicitly - num_items_in_batch = deprecated_arguments.pop("num_items_in_batch", None) - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" - " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( @@ -873,7 +849,7 @@ def forward( logits, labels, vocab_size=self.config.vocab_size, - num_items_in_batch=num_items_in_batch, + num_items_in_batch=kwargs.get("num_items_in_batch"), ) if not return_dict: @@ -925,7 +901,6 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments, ) -> Union[tuple[torch.Tensor], SequenceClassifierOutputWithPast]: r""" input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): @@ -944,16 +919,6 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" - " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( @@ -1059,7 +1024,6 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments, ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]: r""" input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): @@ -1078,16 +1042,6 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" - " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( @@ -1142,7 +1096,6 @@ def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, start_positions: Optional[torch.LongTensor] = None, end_positions: Optional[torch.LongTensor] = None, @@ -1168,7 +1121,6 @@ def forward( outputs = self.transformer( input_ids, attention_mask=attention_mask, - position_ids=position_ids, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 73bfc7407666..c167817937b7 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -192,9 +192,6 @@ def __init__( do_pad: bool = True, **kwargs, ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") - super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 288} size = get_size_dict(size, default_to_square=False) @@ -208,7 +205,7 @@ def __init__( self.do_normalize = do_normalize self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD - self.do_pad = do_pad + self.do_pad = kwargs.pop("pad_and_return_pixel_mask", do_pad) self.do_center_crop = do_center_crop self.crop_size = crop_size diff --git a/src/transformers/models/clap/modeling_clap.py 
b/src/transformers/models/clap/modeling_clap.py index 89ad2ec26a61..276e4e39ea7f 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1372,7 +1372,7 @@ def forward( >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused") >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused") - >>> inputs = processor(audios=audio_sample, return_tensors="pt") + >>> inputs = processor(audio=audio_sample, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_state = outputs.last_hidden_state @@ -1647,7 +1647,7 @@ def forward( >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"] - >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True) + >>> inputs = processor(text=input_text, audio=audio_sample, return_tensors="pt", padding=True) >>> outputs = model(**inputs) >>> logits_per_audio = outputs.logits_per_audio # this is the audio-text similarity score @@ -1819,7 +1819,7 @@ def forward( >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example") >>> audio_sample = dataset["train"]["audio"][0]["array"] - >>> inputs = processor(audios=audio_sample, return_tensors="pt") + >>> inputs = processor(audio=audio_sample, return_tensors="pt") >>> outputs = model(**inputs) >>> audio_embeds = outputs.audio_embeds ```""" diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index a72151cb9b63..cbec5473e458 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -16,13 +16,8 @@ Audio/Text processor class for CLAP """ -from typing import Optional, Union - -from ...audio_utils import AudioInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack -from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...processing_utils import ProcessorMixin from ...utils import logging -from 
...utils.deprecation import deprecate_kwarg logger = logging.get_logger(__name__) @@ -45,28 +40,5 @@ class ClapProcessor(ProcessorMixin): def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) - @deprecate_kwarg("audios", version="v4.59.0", new_name="audio") - def __call__( - self, - text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audios: Optional[AudioInput] = None, - audio: Optional[AudioInput] = None, - **kwargs: Unpack[ProcessingKwargs], - ): - """ - Forwards the `audio` and `sampling_rate` arguments to [`~ClapFeatureExtractor.__call__`] and the `text` - argument to [`~RobertaTokenizerFast.__call__`]. Please refer to the docstring of the above two methods for more - information. - """ - # The `deprecate_kwarg` will not work if the inputs are passed as arguments, so we check - # again that the correct naming is used - if audios is not None and audio is None: - logger.warning( - "Using `audios` keyword argument is deprecated when calling ClapProcessor, instead use `audio`." - ) - audio = audios - - return super().__call__(text=text, audio=audio, **kwargs) - __all__ = ["ClapProcessor"] diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 3f639e0c1ae3..7d7aff908db8 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -826,18 +826,7 @@ def __init__( pad_size: Optional[dict[str, int]] = None, **kwargs, ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. 
" - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - + max_size = None if size is None else kwargs.pop("max_size", 1333) size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} size = get_size_dict(size, max_size=max_size, default_to_square=False) @@ -856,7 +845,7 @@ def __init__( self.do_convert_annotations = do_convert_annotations self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.do_pad = do_pad + self.do_pad = kwargs.pop("pad_and_return_pixel_mask", do_pad) self.pad_size = pad_size self._valid_processor_keys = [ "images", @@ -880,21 +869,6 @@ def __init__( "input_data_format", ] - @classmethod - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->ConditionalDetr - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `ConditionalDetrImageProcessor.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->ConditionalDetr def prepare_annotation( self, @@ -963,15 +937,7 @@ def resize( input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. 
""" - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None - size = get_size_dict(size, max_size=max_size, default_to_square=False) + size = get_size_dict(size, max_size=None, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format @@ -1308,19 +1274,6 @@ def preprocess( provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ - if "pad_and_return_pixel_mask" in kwargs: - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." - ) - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." - ) - size = kwargs.pop("max_size") do_resize = self.do_resize if do_resize is None else do_resize size = self.size if size is None else size @@ -1472,50 +1425,6 @@ def preprocess( return encoded_inputs - def post_process(self, outputs, target_sizes): - """ - Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the Pascal VOC format (xmin, ymin, xmax, ymax). - - Args: - outputs ([`ConditionalDetrObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original - image size (before any data augmentation). For visualization, this should be the image size after data - augment, but before padding. 
- Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. - """ - logging.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 300, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - # Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process_object_detection with DeformableDetr->ConditionalDetr def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None, top_k: int = 100 diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py index 4c5b8602c0cc..51afbd98bb85 100644 --- 
a/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py @@ -27,7 +27,6 @@ AnnotationFormat, AnnotationType, ChannelDimension, - ImageInput, PILImageResampling, get_image_size, validate_annotations, @@ -263,19 +262,10 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast): valid_kwargs = ConditionalDetrImageProcessorKwargs def __init__(self, **kwargs: Unpack[ConditionalDetrImageProcessorKwargs]) -> None: - if "pad_and_return_pixel_mask" in kwargs: - kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") + kwargs.setdefault("do_pad", kwargs.pop("pad_and_return_pixel_mask", self.do_pad)) size = kwargs.pop("size", None) - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - + max_size = None if size is None else kwargs.pop("max_size", 1333) size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} self.size = get_size_dict(size, max_size=max_size, default_to_square=False) @@ -287,20 +277,6 @@ def __init__(self, **kwargs: Unpack[ConditionalDetrImageProcessorKwargs]) -> Non super().__init__(**kwargs) - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. 
`ConditionalDetrImageProcessorFast.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - def prepare_annotation( self, image: torch.Tensor, @@ -520,28 +496,6 @@ def pad( return image, pixel_mask, annotation - @auto_docstring - def preprocess( - self, - images: ImageInput, - **kwargs: Unpack[ConditionalDetrImageProcessorKwargs], - ) -> BatchFeature: - if "pad_and_return_pixel_mask" in kwargs: - kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." - ) - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." - ) - kwargs["size"] = kwargs.pop("max_size") - - return super().preprocess(images, **kwargs) - def _preprocess( self, images: list["torch.Tensor"], @@ -658,51 +612,6 @@ def _preprocess( ] return encoded_inputs - def post_process(self, outputs, target_sizes): - """ - Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the Pascal VOC format (xmin, ymin, xmax, ymax). - Only supports PyTorch. - - Args: - outputs ([`ConditionalDetrObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original - image size (before any data augmentation). For visualization, this should be the image size after data - augment, but before padding. 
- Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. - """ - logging.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 300, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None, top_k: int = 100 ): diff --git a/src/transformers/models/conditional_detr/modular_conditional_detr.py b/src/transformers/models/conditional_detr/modular_conditional_detr.py index 9d0faf2c4b9e..c2cbdd26e31d 100644 --- a/src/transformers/models/conditional_detr/modular_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modular_conditional_detr.py @@ -17,51 +17,6 @@ class 
ConditionalDetrImageProcessorFast(DetrImageProcessorFast): - def post_process(self, outputs, target_sizes): - """ - Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the Pascal VOC format (xmin, ymin, xmax, ymax). - Only supports PyTorch. - - Args: - outputs ([`ConditionalDetrObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original - image size (before any data augmentation). For visualization, this should be the image size after data - augment, but before padding. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. - """ - logging.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 300, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) - boxes = boxes * scale_fct[:, None, 
:] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None, top_k: int = 100 ): @@ -121,14 +76,5 @@ def post_process_object_detection( return results - def post_process_segmentation(self): - raise NotImplementedError("Segmentation post-processing is not implemented for Conditional DETR yet.") - - def post_process_instance(self): - raise NotImplementedError("Instance post-processing is not implemented for Conditional DETR yet.") - - def post_process_panoptic(self): - raise NotImplementedError("Panoptic post-processing is not implemented for Conditional DETR yet.") - __all__ = ["ConditionalDetrImageProcessorFast"] diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 2559a29abca1..aee5b59f36cf 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -838,18 +838,6 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -953,18 +941,6 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1068,18 +1044,6 @@ def __init__(self, config): self.init_weights() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1236,18 +1200,6 @@ def __init__(self, config): self.init_weights() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. 
- """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index b51d7ed0f5d5..74b852b14f7d 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -16,7 +16,6 @@ import collections.abc import math -import warnings from dataclasses import dataclass from typing import Optional, Union @@ -162,14 +161,7 @@ def forward( self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None, - interpolate_pos_encoding: Optional[bool] = None, ) -> torch.Tensor: - if self.position_embeddings is not None and interpolate_pos_encoding is not None: - warnings.warn( - "`interpolate_pos_encoding` argument has no effect for BEiTEmbeddings, embeddings are always " - "interpolated to the input image size. The argument will be removed in transformers v4.51.0." - ) - _, _, height, width = pixel_values.shape embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values) batch_size, seq_len, _ = embeddings.size() @@ -327,19 +319,9 @@ def forward( ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]: if output_attentions: logger.warning_once( - "`Data2VecVisionSdpaSelfAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not " - "support `output_attentions=True`. Falling back to the manual attention implementation, " - "but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. 
" - 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - output_attentions=output_attentions, - relative_position_bias=relative_position_bias, - interpolate_pos_encoding=interpolate_pos_encoding, - resolution=resolution, + f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention weights will " + "be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model." ) - batch_size, seq_length, _ = hidden_states.shape query_layer = ( self.query(hidden_states) diff --git a/src/transformers/models/data2vec/modular_data2vec_audio.py b/src/transformers/models/data2vec/modular_data2vec_audio.py index 142bf7a5e783..3a9f5564d2f8 100644 --- a/src/transformers/models/data2vec/modular_data2vec_audio.py +++ b/src/transformers/models/data2vec/modular_data2vec_audio.py @@ -200,9 +200,6 @@ def __init__(self, config: Data2VecAudioConfig): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - raise AttributeError("Not needed for Data2VecAudio") - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will diff --git a/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py b/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py index a3f4eb0d3340..b122599b724d 100644 --- a/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py @@ -19,7 +19,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import warnings from collections.abc import Callable from typing import Optional, Union @@ -339,10 +338,6 @@ def forward( position_ids: Optional[torch.Tensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) batch_size, seq_length = hidden_states.shape[:-1] query_shape = (batch_size, seq_length, -1, self.qk_head_dim) key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim) diff --git a/src/transformers/models/deepseek_v2/modular_deepseek_v2.py b/src/transformers/models/deepseek_v2/modular_deepseek_v2.py index 7e60d5c858b3..32a9749fe823 100644 --- a/src/transformers/models/deepseek_v2/modular_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/modular_deepseek_v2.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import warnings from collections.abc import Callable from typing import Optional @@ -367,10 +366,6 @@ def forward( position_ids: Optional[torch.Tensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) batch_size, seq_length = hidden_states.shape[:-1] query_shape = (batch_size, seq_length, -1, self.qk_head_dim) key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim) diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 83587f45c295..14b9aa31b5eb 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -831,18 +831,7 @@ def __init__( pad_size: Optional[dict[str, int]] = None, **kwargs, ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - + max_size = None if size is None else kwargs.pop("max_size", 1333) size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} size = get_size_dict(size, max_size=max_size, default_to_square=False) @@ -861,7 +850,7 @@ def __init__( self.do_convert_annotations = do_convert_annotations self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.do_pad = do_pad + self.do_pad = kwargs.pop("pad_and_return_pixel_mask", do_pad) self.pad_size = pad_size self._valid_processor_keys = [ "images", @@ -885,21 +874,6 @@ def __init__( "input_data_format", ] - @classmethod - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->DeformableDetr - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the 
`from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `DeformableDetrImageProcessor.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DeformableDetr def prepare_annotation( self, @@ -968,15 +942,7 @@ def resize( input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. """ - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None - size = get_size_dict(size, max_size=max_size, default_to_square=False) + size = get_size_dict(size, max_size=None, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format @@ -1313,19 +1279,6 @@ def preprocess( provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ - if "pad_and_return_pixel_mask" in kwargs: - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." 
- ) - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." - ) - size = kwargs.pop("max_size") do_resize = self.do_resize if do_resize is None else do_resize size = self.size if size is None else size @@ -1477,51 +1430,6 @@ def preprocess( return encoded_inputs - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. - - Args: - outputs ([`DeformableDetrObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). For visualization, this should be the image size - after data augment, but before padding. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - logger.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None, top_k: int = 100 ): diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py index 916ad3dee0e6..d4a7ca2a8380 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py @@ -26,19 +26,16 @@ AnnotationFormat, AnnotationType, ChannelDimension, - ImageInput, PILImageResampling, get_image_size, validate_annotations, ) from ...processing_utils 
import Unpack -from ...utils import TensorType, auto_docstring, logging +from ...utils import TensorType, auto_docstring from ...utils.import_utils import requires from .image_processing_deformable_detr import DeformableDetrImageProcessorKwargs, get_size_with_aspect_ratio -logger = logging.get_logger(__name__) - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -256,19 +253,10 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast): valid_kwargs = DeformableDetrImageProcessorKwargs def __init__(self, **kwargs: Unpack[DeformableDetrImageProcessorKwargs]) -> None: - if "pad_and_return_pixel_mask" in kwargs: - kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") + kwargs.setdefault("do_pad", kwargs.pop("pad_and_return_pixel_mask", self.do_pad)) size = kwargs.pop("size", None) - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - + max_size = None if size is None else kwargs.pop("max_size", 1333) size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} self.size = get_size_dict(size, max_size=max_size, default_to_square=False) @@ -280,20 +268,6 @@ def __init__(self, **kwargs: Unpack[DeformableDetrImageProcessorKwargs]) -> None super().__init__(**kwargs) - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. 
`DeformableDetrImageProcessorFast.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - def prepare_annotation( self, image: torch.Tensor, @@ -513,28 +487,6 @@ def pad( return image, pixel_mask, annotation - @auto_docstring - def preprocess( - self, - images: ImageInput, - **kwargs: Unpack[DeformableDetrImageProcessorKwargs], - ) -> BatchFeature: - if "pad_and_return_pixel_mask" in kwargs: - kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." - ) - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." - ) - kwargs["size"] = kwargs.pop("max_size") - - return super().preprocess(images, **kwargs) - def _preprocess( self, images: list["torch.Tensor"], @@ -651,51 +603,6 @@ def _preprocess( ] return encoded_inputs - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. - - Args: - outputs ([`DeformableDetrObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). For visualization, this should be the image size - after data augment, but before padding. 
- Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. - """ - logger.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None, top_k: int = 100 ): diff --git a/src/transformers/models/deformable_detr/modular_deformable_detr.py b/src/transformers/models/deformable_detr/modular_deformable_detr.py index 2e38df7845a2..450297519e1f 100644 --- a/src/transformers/models/deformable_detr/modular_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modular_deformable_detr.py @@ -15,51 +15,6 @@ class 
DeformableDetrImageProcessorFast(DetrImageProcessorFast): - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. - - Args: - outputs ([`DeformableDetrObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). For visualization, this should be the image size - after data augment, but before padding. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. - """ - logger.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) - boxes = 
boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None, top_k: int = 100 ): @@ -119,15 +74,6 @@ def post_process_object_detection( return results - def post_process_segmentation(self): - raise NotImplementedError("Segmentation post-processing is not implemented for Deformable DETR yet.") - - def post_process_instance(self): - raise NotImplementedError("Instance post-processing is not implemented for Deformable DETR yet.") - - def post_process_panoptic(self): - raise NotImplementedError("Panoptic post-processing is not implemented for Deformable DETR yet.") - def post_process_instance_segmentation(self): raise NotImplementedError("Segmentation post-processing is not implemented for Deformable DETR yet.") diff --git a/src/transformers/models/deprecated/deta/image_processing_deta.py b/src/transformers/models/deprecated/deta/image_processing_deta.py index a1264cb188e2..b3fa7169c110 100644 --- a/src/transformers/models/deprecated/deta/image_processing_deta.py +++ b/src/transformers/models/deprecated/deta/image_processing_deta.py @@ -499,9 +499,6 @@ def __init__( pad_size: Optional[dict[str, int]] = None, **kwargs, ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") - size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} size = get_size_dict(size, default_to_square=False) @@ -519,7 +516,7 @@ def __init__( self.do_convert_annotations = do_convert_annotations self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.do_pad = do_pad + self.do_pad = kwargs.pop("pad_and_return_pixel_mask", do_pad) self.pad_size = pad_size def prepare_annotation( diff --git 
a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 6f28e572c70d..370ea470291e 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -815,18 +815,7 @@ def __init__( pad_size: Optional[dict[str, int]] = None, **kwargs, ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - + max_size = None if size is None else kwargs.pop("max_size", 1333) size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} size = get_size_dict(size, max_size=max_size, default_to_square=False) @@ -845,7 +834,7 @@ def __init__( self.do_convert_annotations = do_convert_annotations self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.do_pad = do_pad + self.do_pad = kwargs.pop("pad_and_return_pixel_mask", do_pad) self.pad_size = pad_size self._valid_processor_keys = [ "images", @@ -869,20 +858,6 @@ def __init__( "input_data_format", ] - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. 
`DetrImageProcessor.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - def prepare_annotation( self, image: np.ndarray, @@ -949,15 +924,7 @@ def resize( input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. """ - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None - size = get_size_dict(size, max_size=max_size, default_to_square=False) + size = get_size_dict(size, max_size=None, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format @@ -1288,19 +1255,6 @@ def preprocess( provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ - if "pad_and_return_pixel_mask" in kwargs: - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." - ) - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." 
- ) - size = kwargs.pop("max_size") do_resize = self.do_resize if do_resize is None else do_resize size = self.size if size is None else size @@ -1452,276 +1406,6 @@ def preprocess( return encoded_inputs - # inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258 - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, - bottom_right_x, bottom_right_y) format. - - Args: - outputs ([`DetrObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). For visualization, this should be the image size - after data augment, but before padding. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - logger.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = nn.functional.softmax(out_logits, -1) - scores, labels = prob[..., :-1].max(-1) - - # convert to [x0, y0, x1, y1] format - boxes = center_to_corners_format(out_bbox) - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - return results - - def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): - """ - Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch. - - Args: - outputs ([`DetrSegmentationOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`): - Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. - threshold (`float`, *optional*, defaults to 0.9): - Threshold to use to filter out queries. - mask_threshold (`float`, *optional*, defaults to 0.5): - Threshold to use when turning the predicted masks into binary values. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image - in the batch as predicted by the model. 
- """ - logger.warning_once( - "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_semantic_segmentation`.", - ) - out_logits, raw_masks = outputs.logits, outputs.pred_masks - empty_label = out_logits.shape[-1] - 1 - preds = [] - - def to_tuple(tup): - if isinstance(tup, tuple): - return tup - return tuple(tup.tolist()) - - for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes): - # we filter empty queries and detection below threshold - cur_scores, cur_labels = cur_logits.softmax(-1).max(-1) - keep = cur_labels.ne(empty_label) & (cur_scores > threshold) - cur_scores = cur_scores[keep] - cur_labels = cur_labels[keep] - cur_masks = cur_masks[keep] - cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) - cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1 - - predictions = {"scores": cur_scores, "labels": cur_labels, "masks": cur_masks} - preds.append(predictions) - return preds - - # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218 - def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): - """ - Converts the output of [`DetrForSegmentation`] into actual instance segmentation predictions. Only supports - PyTorch. - - Args: - results (`list[Dict]`): - Results list obtained by [`~DetrImageProcessor.post_process`], to which "masks" results will be added. - outputs ([`DetrSegmentationOutput`]): - Raw outputs of the model. - orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original - image size (before any data augmentation). - max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the maximum size (h, w) of each image of the batch. 
For evaluation, this must be the - original image size (before any data augmentation). - threshold (`float`, *optional*, defaults to 0.5): - Threshold to use when turning the predicted masks into binary values. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an - image in the batch as predicted by the model. - """ - logger.warning_once( - "`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_instance_segmentation`.", - ) - - if len(orig_target_sizes) != len(max_target_sizes): - raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes") - max_h, max_w = max_target_sizes.max(0)[0].tolist() - outputs_masks = outputs.pred_masks.squeeze(2) - outputs_masks = nn.functional.interpolate( - outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False - ) - outputs_masks = (outputs_masks.sigmoid() > threshold).cpu() - - for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): - img_h, img_w = t[0], t[1] - results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) - results[i]["masks"] = nn.functional.interpolate( - results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" - ).byte() - - return results - - # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241 - def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): - """ - Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch. - - Args: - outputs ([`DetrSegmentationOutput`]): - Raw outputs of the model. - processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`): - Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data - augmentation but before batching. 
- target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`, *optional*): - Torch Tensor (or list) corresponding to the requested final size `(height, width)` of each prediction. - If left to None, it will default to the `processed_sizes`. - is_thing_map (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): - Dictionary mapping class indices to either True or False, depending on whether or not they are a thing. - If not set, defaults to the `is_thing_map` of COCO panoptic. - threshold (`float`, *optional*, defaults to 0.85): - Threshold to use to filter out queries. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for - an image in the batch as predicted by the model. - """ - logger.warning_once( - "`post_process_panoptic is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_panoptic_segmentation`.", - ) - if target_sizes is None: - target_sizes = processed_sizes - if len(processed_sizes) != len(target_sizes): - raise ValueError("Make sure to pass in as many processed_sizes as target_sizes") - - if is_thing_map is None: - # default to is_thing_map of COCO panoptic - is_thing_map = {i: i <= 90 for i in range(201)} - - out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes - if not len(out_logits) == len(raw_masks) == len(target_sizes): - raise ValueError( - "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks" - ) - empty_label = out_logits.shape[-1] - 1 - preds = [] - - def to_tuple(tup): - if isinstance(tup, tuple): - return tup - return tuple(tup.tolist()) - - for cur_logits, cur_masks, cur_boxes, size, target_size in zip( - out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes - ): - # we filter empty queries and detection below threshold - cur_scores, cur_labels = cur_logits.softmax(-1).max(-1) - keep = cur_labels.ne(empty_label) & 
(cur_scores > threshold) - cur_scores = cur_scores[keep] - cur_labels = cur_labels[keep] - cur_masks = cur_masks[keep] - cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) - cur_boxes = center_to_corners_format(cur_boxes[keep]) - - h, w = cur_masks.shape[-2:] - if len(cur_boxes) != len(cur_labels): - raise ValueError("Not as many boxes as there are classes") - - # It may be that we have several predicted masks for the same stuff class. - # In the following, we track the list of masks ids for each stuff class (they are merged later on) - cur_masks = cur_masks.flatten(1) - stuff_equiv_classes = defaultdict(list) - for k, label in enumerate(cur_labels): - if not is_thing_map[label.item()]: - stuff_equiv_classes[label.item()].append(k) - - def get_ids_area(masks, scores, dedup=False): - # This helper function creates the final panoptic segmentation image - # It also returns the area of the masks that appears on the image - - m_id = masks.transpose(0, 1).softmax(-1) - - if m_id.shape[-1] == 0: - # We didn't detect any mask :( - m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) - else: - m_id = m_id.argmax(-1).view(h, w) - - if dedup: - # Merge the masks corresponding to the same stuff class - for equiv in stuff_equiv_classes.values(): - if len(equiv) > 1: - for eq_id in equiv: - m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) - - final_h, final_w = to_tuple(target_size) - - seg_img = PIL.Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy())) - seg_img = seg_img.resize(size=(final_w, final_h), resample=PILImageResampling.NEAREST) - - np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) - np_seg_img = np_seg_img.view(final_h, final_w, 3) - np_seg_img = np_seg_img.numpy() - - m_id = torch.from_numpy(rgb_to_id(np_seg_img)) - - area = [] - for i in range(len(scores)): - area.append(m_id.eq(i).sum().item()) - return area, seg_img - - area, seg_img = get_ids_area(cur_masks, cur_scores, 
dedup=True) - if cur_labels.numel() > 0: - # We know filter empty masks as long as we find some - while True: - filtered_small = torch.as_tensor( - [area[i] <= 4 for i, c in enumerate(cur_labels)], dtype=torch.bool, device=keep.device - ) - if filtered_small.any().item(): - cur_scores = cur_scores[~filtered_small] - cur_labels = cur_labels[~filtered_small] - cur_masks = cur_masks[~filtered_small] - area, seg_img = get_ids_area(cur_masks, cur_scores) - else: - break - - else: - cur_labels = torch.ones(1, dtype=torch.long, device=cur_labels.device) - - segments_info = [] - for i, a in enumerate(area): - cat = cur_labels[i].item() - segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a}) - del cur_labels - - with io.BytesIO() as out: - seg_img.save(out, format="PNG") - predictions = {"png_string": out.getvalue(), "segments_info": segments_info} - preds.append(predictions) - return preds - # inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258 def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: Optional[Union[TensorType, list[tuple]]] = None diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index 360144e983b2..61a68f74267a 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -14,12 +14,9 @@ # limitations under the License. 
"""Fast Image processor class for DETR.""" -import io import pathlib -from collections import defaultdict from typing import Any, Optional, Union -import PIL import torch from torch import nn from torchvision.io import read_image @@ -33,14 +30,13 @@ get_max_height_width, safe_squeeze, ) -from ...image_transforms import center_to_corners_format, corners_to_center_format, id_to_rgb +from ...image_transforms import center_to_corners_format, corners_to_center_format from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, AnnotationFormat, AnnotationType, ChannelDimension, - ImageInput, PILImageResampling, get_image_size, validate_annotations, @@ -280,19 +276,10 @@ class DetrImageProcessorFast(BaseImageProcessorFast): valid_kwargs = DetrImageProcessorKwargs def __init__(self, **kwargs: Unpack[DetrImageProcessorKwargs]) -> None: - if "pad_and_return_pixel_mask" in kwargs: - kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") + kwargs.setdefault("do_pad", kwargs.pop("pad_and_return_pixel_mask", self.do_pad)) size = kwargs.pop("size", None) - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - + max_size = None if size is None else kwargs.pop("max_size", 1333) size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} self.size = get_size_dict(size, max_size=max_size, default_to_square=False) @@ -304,20 +291,6 @@ def __init__(self, **kwargs: Unpack[DetrImageProcessorKwargs]) -> None: super().__init__(**kwargs) - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. 
`DetrImageProcessorFast.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - def prepare_annotation( self, image: torch.Tensor, @@ -537,28 +510,6 @@ def pad( return image, pixel_mask, annotation - @auto_docstring - def preprocess( - self, - images: ImageInput, - **kwargs: Unpack[DetrImageProcessorKwargs], - ) -> BatchFeature: - if "pad_and_return_pixel_mask" in kwargs: - kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." - ) - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." - ) - kwargs["size"] = kwargs.pop("max_size") - - return super().preprocess(images, **kwargs) - def _preprocess( self, images: list["torch.Tensor"], @@ -675,277 +626,6 @@ def _preprocess( ] return encoded_inputs - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, - bottom_right_x, bottom_right_y) format. - - Args: - outputs ([`DetrObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). 
For visualization, this should be the image size - after data augment, but before padding. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. - """ - logger.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = nn.functional.softmax(out_logits, -1) - scores, labels = prob[..., :-1].max(-1) - - # convert to [x0, y0, x1, y1] format - boxes = center_to_corners_format(out_bbox) - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - return results - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_segmentation - def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): - """ - Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch. - - Args: - outputs ([`DetrSegmentationOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`): - Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. 
- threshold (`float`, *optional*, defaults to 0.9): - Threshold to use to filter out queries. - mask_threshold (`float`, *optional*, defaults to 0.5): - Threshold to use when turning the predicted masks into binary values. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image - in the batch as predicted by the model. - """ - logger.warning_once( - "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_semantic_segmentation`.", - ) - out_logits, raw_masks = outputs.logits, outputs.pred_masks - empty_label = out_logits.shape[-1] - 1 - preds = [] - - def to_tuple(tup): - if isinstance(tup, tuple): - return tup - return tuple(tup.tolist()) - - for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes): - # we filter empty queries and detection below threshold - cur_scores, cur_labels = cur_logits.softmax(-1).max(-1) - keep = cur_labels.ne(empty_label) & (cur_scores > threshold) - cur_scores = cur_scores[keep] - cur_labels = cur_labels[keep] - cur_masks = cur_masks[keep] - cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) - cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1 - - predictions = {"scores": cur_scores, "labels": cur_labels, "masks": cur_masks} - preds.append(predictions) - return preds - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_instance - def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): - """ - Converts the output of [`DetrForSegmentation`] into actual instance segmentation predictions. Only supports - PyTorch. - - Args: - results (`list[Dict]`): - Results list obtained by [`~DetrImageProcessor.post_process`], to which "masks" results will be added. - outputs ([`DetrSegmentationOutput`]): - Raw outputs of the model. 
- orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original - image size (before any data augmentation). - max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). - threshold (`float`, *optional*, defaults to 0.5): - Threshold to use when turning the predicted masks into binary values. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an - image in the batch as predicted by the model. - """ - logger.warning_once( - "`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_instance_segmentation`.", - ) - - if len(orig_target_sizes) != len(max_target_sizes): - raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes") - max_h, max_w = max_target_sizes.max(0)[0].tolist() - outputs_masks = outputs.pred_masks.squeeze(2) - outputs_masks = nn.functional.interpolate( - outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False - ) - outputs_masks = (outputs_masks.sigmoid() > threshold).cpu() - - for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): - img_h, img_w = t[0], t[1] - results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) - results[i]["masks"] = nn.functional.interpolate( - results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" - ).byte() - - return results - - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_panoptic - def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): - """ - Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. 
Only supports PyTorch. - - Args: - outputs ([`DetrSegmentationOutput`]): - Raw outputs of the model. - processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`): - Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data - augmentation but before batching. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`, *optional*): - Torch Tensor (or list) corresponding to the requested final size `(height, width)` of each prediction. - If left to None, it will default to the `processed_sizes`. - is_thing_map (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): - Dictionary mapping class indices to either True or False, depending on whether or not they are a thing. - If not set, defaults to the `is_thing_map` of COCO panoptic. - threshold (`float`, *optional*, defaults to 0.85): - Threshold to use to filter out queries. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for - an image in the batch as predicted by the model. 
- """ - logger.warning_once( - "`post_process_panoptic is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_panoptic_segmentation`.", - ) - if target_sizes is None: - target_sizes = processed_sizes - if len(processed_sizes) != len(target_sizes): - raise ValueError("Make sure to pass in as many processed_sizes as target_sizes") - - if is_thing_map is None: - # default to is_thing_map of COCO panoptic - is_thing_map = {i: i <= 90 for i in range(201)} - - out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes - if not len(out_logits) == len(raw_masks) == len(target_sizes): - raise ValueError( - "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks" - ) - empty_label = out_logits.shape[-1] - 1 - preds = [] - - def to_tuple(tup): - if isinstance(tup, tuple): - return tup - return tuple(tup.tolist()) - - for cur_logits, cur_masks, cur_boxes, size, target_size in zip( - out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes - ): - # we filter empty queries and detection below threshold - cur_scores, cur_labels = cur_logits.softmax(-1).max(-1) - keep = cur_labels.ne(empty_label) & (cur_scores > threshold) - cur_scores = cur_scores[keep] - cur_labels = cur_labels[keep] - cur_masks = cur_masks[keep] - cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) - cur_boxes = center_to_corners_format(cur_boxes[keep]) - - h, w = cur_masks.shape[-2:] - if len(cur_boxes) != len(cur_labels): - raise ValueError("Not as many boxes as there are classes") - - # It may be that we have several predicted masks for the same stuff class. 
- # In the following, we track the list of masks ids for each stuff class (they are merged later on) - cur_masks = cur_masks.flatten(1) - stuff_equiv_classes = defaultdict(list) - for k, label in enumerate(cur_labels): - if not is_thing_map[label.item()]: - stuff_equiv_classes[label.item()].append(k) - - def get_ids_area(masks, scores, dedup=False): - # This helper function creates the final panoptic segmentation image - # It also returns the area of the masks that appears on the image - - m_id = masks.transpose(0, 1).softmax(-1) - - if m_id.shape[-1] == 0: - # We didn't detect any mask :( - m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) - else: - m_id = m_id.argmax(-1).view(h, w) - - if dedup: - # Merge the masks corresponding to the same stuff class - for equiv in stuff_equiv_classes.values(): - if len(equiv) > 1: - for eq_id in equiv: - m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) - - final_h, final_w = to_tuple(target_size) - - seg_img = PIL.Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy())) - seg_img = seg_img.resize(size=(final_w, final_h), resample=PILImageResampling.NEAREST) - - np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) - np_seg_img = np_seg_img.view(final_h, final_w, 3) - np_seg_img = np_seg_img.numpy() - - m_id = torch.from_numpy(rgb_to_id(np_seg_img)) - - area = [] - for i in range(len(scores)): - area.append(m_id.eq(i).sum().item()) - return area, seg_img - - area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) - if cur_labels.numel() > 0: - # We know filter empty masks as long as we find some - while True: - filtered_small = torch.as_tensor( - [area[i] <= 4 for i, c in enumerate(cur_labels)], dtype=torch.bool, device=keep.device - ) - if filtered_small.any().item(): - cur_scores = cur_scores[~filtered_small] - cur_labels = cur_labels[~filtered_small] - cur_masks = cur_masks[~filtered_small] - area, seg_img = get_ids_area(cur_masks, cur_scores) - else: - break - - else: - 
cur_labels = torch.ones(1, dtype=torch.long, device=cur_labels.device) - - segments_info = [] - for i, a in enumerate(area): - cat = cur_labels[i].item() - segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a}) - del cur_labels - - with io.BytesIO() as out: - seg_img.save(out, format="PNG") - predictions = {"png_string": out.getvalue(), "segments_info": segments_info} - preds.append(predictions) - return preds - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_object_detection def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: Optional[Union[TensorType, list[tuple]]] = None diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index eb21ea3b376e..b3db644ed518 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -868,18 +868,7 @@ def __init__( pad_size: Optional[dict[str, int]] = None, **kwargs, ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. 
" - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - + max_size = None if size is None else kwargs.pop("max_size", 1333) size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} size = get_size_dict(size, max_size=max_size, default_to_square=False) @@ -898,7 +887,7 @@ def __init__( self.do_convert_annotations = do_convert_annotations self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.do_pad = do_pad + self.do_pad = kwargs.pop("pad_and_return_pixel_mask", do_pad) self.pad_size = pad_size self._valid_processor_keys = [ "images", @@ -922,21 +911,6 @@ def __init__( "input_data_format", ] - @classmethod - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDino - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `GroundingDinoImageProcessor.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->GroundingDino def prepare_annotation( self, @@ -1005,15 +979,7 @@ def resize( input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. 
""" - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None - size = get_size_dict(size, max_size=max_size, default_to_square=False) + size = get_size_dict(size, max_size=None, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format @@ -1350,19 +1316,6 @@ def preprocess( provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ - if "pad_and_return_pixel_mask" in kwargs: - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." - ) - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." 
- ) - size = kwargs.pop("max_size") do_resize = self.do_resize if do_resize is None else do_resize size = self.size if size is None else size diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py index ee303ec47fc4..bb911ac27b86 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py @@ -46,13 +46,12 @@ AnnotationFormat, AnnotationType, ChannelDimension, - ImageInput, PILImageResampling, get_image_size, validate_annotations, ) from ...processing_utils import Unpack -from ...utils import TensorType, auto_docstring, logging +from ...utils import TensorType, auto_docstring from ...utils.import_utils import requires from .image_processing_grounding_dino import GroundingDinoImageProcessorKwargs, get_size_with_aspect_ratio @@ -60,9 +59,6 @@ if TYPE_CHECKING: from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput - -logger = logging.get_logger(__name__) - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -308,19 +304,10 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast): valid_kwargs = GroundingDinoImageProcessorKwargs def __init__(self, **kwargs: Unpack[GroundingDinoImageProcessorKwargs]) -> None: - if "pad_and_return_pixel_mask" in kwargs: - kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") + kwargs.setdefault("do_pad", kwargs.pop("pad_and_return_pixel_mask", self.do_pad)) size = kwargs.pop("size", None) - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. 
" - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - + max_size = None if size is None else kwargs.pop("max_size", 1333) size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} self.size = get_size_dict(size, max_size=max_size, default_to_square=False) @@ -332,20 +319,6 @@ def __init__(self, **kwargs: Unpack[GroundingDinoImageProcessorKwargs]) -> None: super().__init__(**kwargs) - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `GroundingDinoImageProcessorFast.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - def prepare_annotation( self, image: torch.Tensor, @@ -565,28 +538,6 @@ def pad( return image, pixel_mask, annotation - @auto_docstring - def preprocess( - self, - images: ImageInput, - **kwargs: Unpack[GroundingDinoImageProcessorKwargs], - ) -> BatchFeature: - if "pad_and_return_pixel_mask" in kwargs: - kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." - ) - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." 
- ) - kwargs["size"] = kwargs.pop("max_size") - - return super().preprocess(images, **kwargs) - def _preprocess( self, images: list["torch.Tensor"], diff --git a/src/transformers/models/grounding_dino/modular_grounding_dino.py b/src/transformers/models/grounding_dino/modular_grounding_dino.py index ded6435508a5..2ea1a4e43930 100644 --- a/src/transformers/models/grounding_dino/modular_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modular_grounding_dino.py @@ -120,18 +120,6 @@ def post_process_object_detection( return results - def post_process(self): - raise NotImplementedError("Post-processing is not implemented for Grounding-Dino yet.") - - def post_process_segmentation(self): - raise NotImplementedError("Segmentation post-processing is not implemented for Grounding-Dino yet.") - - def post_process_instance(self): - raise NotImplementedError("Instance post-processing is not implemented for Grounding-Dino yet.") - - def post_process_panoptic(self): - raise NotImplementedError("Panoptic post-processing is not implemented for Grounding-Dino yet.") - def post_process_instance_segmentation(self): raise NotImplementedError("Segmentation post-processing is not implemented for Grounding-Dino yet.") diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 9729e481f402..31be18d2ef9d 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -19,7 +19,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import warnings from collections.abc import Callable from typing import Optional, Union @@ -1013,18 +1012,6 @@ def tie_weights(self): elif target_lang is not None: self.load_adapter(target_lang, force_load=True) - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1136,18 +1123,6 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will diff --git a/src/transformers/models/hubert/modular_hubert.py b/src/transformers/models/hubert/modular_hubert.py index a0a7d805c973..13889248bc8d 100644 --- a/src/transformers/models/hubert/modular_hubert.py +++ b/src/transformers/models/hubert/modular_hubert.py @@ -213,9 +213,6 @@ def __init__(self, config: HubertConfig): del self.adapter - def freeze_feature_extractor(self): - raise AttributeError("Not needed for Hubert") - def freeze_feature_encoder(self): raise AttributeError("Not needed for Hubert") diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 7cb640e56854..7e128e4a2350 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -29,7 +29,6 @@ ) from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_torch_available -from ...utils.deprecation import deprecate_kwarg if is_torch_available(): @@ -171,7 +170,6 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u "" in self.tokenizer.special_tokens_map.get("additional_special_tokens", []) ) - @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True) def __call__( self, images: Union[ImageInput, list[ImageInput], str, list[str], list[list[str]]] = None, diff --git a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py index e3f9824de41d..eaf7c6f082bf 100644 --- a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +++ 
b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py @@ -644,8 +644,6 @@ def forward( return attn_output, attn_weights -# NO LONGER EXIST Copied from transformers.models.gemma.modeling_gemma.GemmaSdpaAttention with Gemma->KyutaiSpeechToText -# TODO cyril: modular class KyutaiSpeechToTextSdpaAttention(KyutaiSpeechToTextAttention): """ KyutaiSpeechToText attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from @@ -666,21 +664,10 @@ def forward( **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( - "KyutaiSpeechToTextModel is using KyutaiSpeechToTextSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, + f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention weights will " + "be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model." 
) - bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states, cache_position) # Ignore copy diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py index a37301a17741..88a0f4f82e53 100644 --- a/src/transformers/models/llama4/configuration_llama4.py +++ b/src/transformers/models/llama4/configuration_llama4.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import warnings from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation @@ -133,19 +132,6 @@ def __init__( rope_theta = kwargs.get("rope_theta", 10000.0) standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) - - @property - def vision_feature_layer(self): - warnings.warn( - "The `vision_feature_layer` attribute is deprecated and will be removed in v4.58.0.", - FutureWarning, - ) - return self._vision_feature_layer - - @vision_feature_layer.setter - def vision_feature_layer(self, value): - self._vision_feature_layer = value - super().__init__(**kwargs) diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py index 6b012a5b096a..f6239f891285 100644 --- a/src/transformers/models/llama4/modeling_llama4.py +++ b/src/transformers/models/llama4/modeling_llama4.py @@ -39,7 +39,6 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging -from ...utils.deprecation import deprecate_kwarg from ...utils.generic import check_model_inputs from .configuration_llama4 import Llama4Config, Llama4TextConfig @@ -1258,7 +1257,6 @@ def get_placeholder_mask( return special_image_mask @auto_docstring - @deprecate_kwarg("vision_feature_layer", version="4.58") def forward( self, input_ids: 
Optional[torch.LongTensor] = None, @@ -1267,7 +1265,6 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[Union[int, list[int]]] = None, vision_feature_select_strategy: Optional[str] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, diff --git a/src/transformers/models/mask2former/modular_mask2former.py b/src/transformers/models/mask2former/modular_mask2former.py index c5f3f58fedbb..9d6a021a6155 100644 --- a/src/transformers/models/mask2former/modular_mask2former.py +++ b/src/transformers/models/mask2former/modular_mask2former.py @@ -305,8 +305,5 @@ def post_process_panoptic_segmentation( results.append({"segmentation": segmentation, "segments_info": segments}) return results - def post_process_segmentation(self): - raise NotImplementedError("Segmentation post-processing is not implemented for Mask2Former yet.") - __all__ = ["Mask2FormerImageProcessorFast"] diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index 7d83809ced66..586913c36f68 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -15,9 +15,8 @@ """Image processor class for MaskFormer.""" import math -import warnings from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import Any, Optional, Union import numpy as np @@ -58,10 +57,6 @@ logger = logging.get_logger(__name__) -if TYPE_CHECKING: - from transformers import MaskFormerForInstanceSegmentationOutput - - if is_torch_available(): import torch from torch import nn @@ -989,54 +984,6 @@ def encode_inputs( return encoded_inputs - def post_process_segmentation( - self, outputs: "MaskFormerForInstanceSegmentationOutput", target_size: 
Optional[tuple[int, int]] = None - ) -> "torch.Tensor": - """ - Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into image segmentation predictions. Only - supports PyTorch. - - Args: - outputs ([`MaskFormerForInstanceSegmentationOutput`]): - The outputs from [`MaskFormerForInstanceSegmentation`]. - - target_size (`tuple[int, int]`, *optional*): - If set, the `masks_queries_logits` will be resized to `target_size`. - - Returns: - `torch.Tensor`: - A tensor of shape (`batch_size, num_class_labels, height, width`). - """ - warnings.warn( - "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_instance_segmentation`", - FutureWarning, - ) - - # class_queries_logits has shape [BATCH, QUERIES, CLASSES + 1] - class_queries_logits = outputs.class_queries_logits - # masks_queries_logits has shape [BATCH, QUERIES, HEIGHT, WIDTH] - masks_queries_logits = outputs.masks_queries_logits - if target_size is not None: - masks_queries_logits = torch.nn.functional.interpolate( - masks_queries_logits, - size=target_size, - mode="bilinear", - align_corners=False, - ) - # remove the null class `[..., :-1]` - masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1] - # mask probs has shape [BATCH, QUERIES, HEIGHT, WIDTH] - masks_probs = masks_queries_logits.sigmoid() - # now we want to sum over the queries, - # $ out_{c,h,w} = \sum_q p_{q,c} * m_{q,h,w} $ - # where $ softmax(p) \in R^{q, c} $ is the mask classes - # and $ sigmoid(m) \in R^{q, h, w}$ is the mask probabilities - # b(atch)q(uery)c(lasses), b(atch)q(uery)h(eight)w(idth) - segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs) - - return segmentation - def post_process_semantic_segmentation( self, outputs, target_sizes: Optional[list[tuple[int, int]]] = None ) -> "torch.Tensor": diff --git a/src/transformers/models/maskformer/image_processing_maskformer_fast.py 
b/src/transformers/models/maskformer/image_processing_maskformer_fast.py index 59570545e5bd..2be8ca8f16a9 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer_fast.py +++ b/src/transformers/models/maskformer/image_processing_maskformer_fast.py @@ -15,7 +15,6 @@ """Fast Image processor class for MaskFormer.""" import math -import warnings from typing import TYPE_CHECKING, Any, Optional, Union import torch @@ -58,7 +57,7 @@ if TYPE_CHECKING: - from transformers import MaskFormerForInstanceSegmentationOutput + pass def convert_segmentation_map_to_binary_masks_fast( @@ -406,55 +405,6 @@ def _preprocess( return encoded_inputs - # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.post_process_segmentation - def post_process_segmentation( - self, outputs: "MaskFormerForInstanceSegmentationOutput", target_size: Optional[tuple[int, int]] = None - ) -> "torch.Tensor": - """ - Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into image segmentation predictions. Only - supports PyTorch. - - Args: - outputs ([`MaskFormerForInstanceSegmentationOutput`]): - The outputs from [`MaskFormerForInstanceSegmentation`]. - - target_size (`tuple[int, int]`, *optional*): - If set, the `masks_queries_logits` will be resized to `target_size`. - - Returns: - `torch.Tensor`: - A tensor of shape (`batch_size, num_class_labels, height, width`). 
- """ - warnings.warn( - "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_instance_segmentation`", - FutureWarning, - ) - - # class_queries_logits has shape [BATCH, QUERIES, CLASSES + 1] - class_queries_logits = outputs.class_queries_logits - # masks_queries_logits has shape [BATCH, QUERIES, HEIGHT, WIDTH] - masks_queries_logits = outputs.masks_queries_logits - if target_size is not None: - masks_queries_logits = torch.nn.functional.interpolate( - masks_queries_logits, - size=target_size, - mode="bilinear", - align_corners=False, - ) - # remove the null class `[..., :-1]` - masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1] - # mask probs has shape [BATCH, QUERIES, HEIGHT, WIDTH] - masks_probs = masks_queries_logits.sigmoid() - # now we want to sum over the queries, - # $ out_{c,h,w} = \sum_q p_{q,c} * m_{q,h,w} $ - # where $ softmax(p) \in R^{q, c} $ is the mask classes - # and $ sigmoid(m) \in R^{q, h, w}$ is the mask probabilities - # b(atch)q(uery)c(lasses), b(atch)q(uery)h(eight)w(idth) - segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs) - - return segmentation - # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.post_process_semantic_segmentation def post_process_semantic_segmentation( self, outputs, target_sizes: Optional[list[tuple[int, int]]] = None diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py index 83bcbd857a0d..707fc6599d8b 100644 --- a/src/transformers/models/mimi/modeling_mimi.py +++ b/src/transformers/models/mimi/modeling_mimi.py @@ -870,21 +870,10 @@ def forward( **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. 
logger.warning_once( - "MimiModel is using MimiSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention weights will " + "be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model." ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states) diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index 01c89ecb52cc..6b0dcdf6b103 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -643,8 +643,7 @@ def forward( return attn_output, attn_weights -# NO LONGER EXIST Copied from transformers.models.gemma.modeling_gemma.GemmaSdpaAttention with Gemma->Moshi -# TODO cyril: modular +# Copied from transformers.models.mimi.modeling_mimi.MimiSdpaAttention with Mimi->Moshi class MoshiSdpaAttention(MoshiAttention): """ Moshi attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from @@ -665,21 +664,10 @@ def forward( **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. 
logger.warning_once( - "MoshiModel is using MoshiSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention weights will " + "be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model." ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states, cache_position) # Ignore copy diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index 1c8c7eca861f..8448c55444f9 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -454,22 +454,10 @@ def forward( **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( - "NemotronModel is using NemotronSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention weights will " + "be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model." ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - position_embeddings=position_embeddings, - ) - bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states) diff --git a/src/transformers/models/owlv2/image_processing_owlv2_fast.py b/src/transformers/models/owlv2/image_processing_owlv2_fast.py index f1a8a79fb81e..d31173c997c4 100644 --- a/src/transformers/models/owlv2/image_processing_owlv2_fast.py +++ b/src/transformers/models/owlv2/image_processing_owlv2_fast.py @@ -53,51 +53,6 @@ class Owlv2ImageProcessorFast(BaseImageProcessorFast): rescale_factor = 1 / 255 do_pad = True - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, - bottom_right_x, bottom_right_y) format. - - Args: - outputs ([`Owlv2ObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original - image size (before any data augmentation). For visualization, this should be the image size after data - augment, but before padding. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - warnings.warn( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - FutureWarning, - ) - - logits, boxes = outputs.logits, outputs.pred_boxes - - if len(logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - probs = torch.max(logits, dim=-1) - scores = torch.sigmoid(probs.values) - labels = probs.indices - - # Convert to [x0, y0, x1, y1] format - boxes = center_to_corners_format(boxes) - - # Convert from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - def post_process_object_detection( self, outputs: "Owlv2ObjectDetectionOutput", diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 52889721820f..57df27ef5a00 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -16,7 +16,6 @@ Image/Text processor class for OWLv2 """ -import warnings from typing import TYPE_CHECKING, Optional, Union import numpy as np @@ -170,19 +169,6 @@ def __call__( return BatchFeature(data=data, tensor_type=return_tensors) - # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OwlViT->Owlv2 - def post_process_object_detection(self, *args, **kwargs): - """ - This method forwards all its arguments to [`Owlv2ImageProcessor.post_process_object_detection`]. 
Please refer - to the docstring of this method for more information. - """ - warnings.warn( - "`post_process_object_detection` method is deprecated for OwlVitProcessor and will be removed in v5. " - "Use `post_process_grounded_object_detection` instead.", - FutureWarning, - ) - return self.image_processor.post_process_object_detection(*args, **kwargs) - # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_grounded_object_detection with OwlViT->Owlv2 def post_process_grounded_object_detection( self, diff --git a/src/transformers/models/owlvit/image_processing_owlvit.py b/src/transformers/models/owlvit/image_processing_owlvit.py index 42e3f10269b4..d72d1a933b59 100644 --- a/src/transformers/models/owlvit/image_processing_owlvit.py +++ b/src/transformers/models/owlvit/image_processing_owlvit.py @@ -14,7 +14,6 @@ # limitations under the License. """Image processor class for OwlViT""" -import warnings from typing import TYPE_CHECKING, Optional, Union import numpy as np @@ -440,51 +439,6 @@ def preprocess( encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) return encoded_inputs - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, - bottom_right_x, bottom_right_y) format. - - Args: - outputs ([`OwlViTObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original - image size (before any data augmentation). For visualization, this should be the image size after data - augment, but before padding. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - warnings.warn( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - FutureWarning, - ) - - logits, boxes = outputs.logits, outputs.pred_boxes - - if len(logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - probs = torch.max(logits, dim=-1) - scores = torch.sigmoid(probs.values) - labels = probs.indices - - # Convert to [x0, y0, x1, y1] format - boxes = center_to_corners_format(boxes) - - # Convert from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - def post_process_object_detection( self, outputs: "OwlViTObjectDetectionOutput", diff --git a/src/transformers/models/owlvit/image_processing_owlvit_fast.py b/src/transformers/models/owlvit/image_processing_owlvit_fast.py index 53d94313ece9..6e90d2bcb0be 100644 --- a/src/transformers/models/owlvit/image_processing_owlvit_fast.py +++ b/src/transformers/models/owlvit/image_processing_owlvit_fast.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Fast Image processor class for OwlViT""" -import warnings from typing import TYPE_CHECKING, Optional, Union import torch @@ -48,52 +47,6 @@ class OwlViTImageProcessorFast(BaseImageProcessorFast): do_convert_rgb = None model_input_names = ["pixel_values"] - # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, - bottom_right_x, bottom_right_y) format. - - Args: - outputs ([`OwlViTObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original - image size (before any data augmentation). For visualization, this should be the image size after data - augment, but before padding. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - warnings.warn( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - FutureWarning, - ) - - logits, boxes = outputs.logits, outputs.pred_boxes - - if len(logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - probs = torch.max(logits, dim=-1) - scores = torch.sigmoid(probs.values) - labels = probs.indices - - # Convert to [x0, y0, x1, y1] format - boxes = center_to_corners_format(boxes) - - # Convert from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection def post_process_object_detection( self, diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index 0443ab64eda9..3c6dd617c214 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -16,7 +16,6 @@ Image/Text processor class for OWL-ViT """ -import warnings from typing import TYPE_CHECKING, Optional, Union import numpy as np @@ -175,18 +174,6 @@ def post_process(self, *args, **kwargs): """ return self.image_processor.post_process(*args, **kwargs) - def post_process_object_detection(self, *args, **kwargs): - """ - This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. 
Please refer - to the docstring of this method for more information. - """ - warnings.warn( - "`post_process_object_detection` method is deprecated for OwlVitProcessor and will be removed in v5. " - "Use `post_process_grounded_object_detection` instead.", - FutureWarning, - ) - return self.image_processor.post_process_object_detection(*args, **kwargs) - def post_process_grounded_object_detection( self, outputs: "OwlViTObjectDetectionOutput", diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py index 80b23721431d..bc6304817d59 100644 --- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py @@ -1903,7 +1903,7 @@ def forward( >>> text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) >>> audios = [ librosa.load(BytesIO(urlopen( conversations[1]['content'][1]['audio_url'] ).read()), sr=self.processor.feature_extractor.sampling_rate) ] >>> images, videos = process_vision_info(conversations) - >>> inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True) + >>> inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True) >>> # Generate >>> inputs['use_audio_in_video'] = `True` or `False` @@ -2349,7 +2349,7 @@ def forward( >>> url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3" >>> audio, _ = librosa.load(BytesIO(urlopen(url).read()), sr=self.processor.feature_extractor.sampling_rate) - >>> inputs = processor(text=prompt, audios=audio, return_tensors="pt") + >>> inputs = processor(text=prompt, audio=audio, return_tensors="pt") >>> # Generate >>> generate_ids = model.generate(**inputs, max_length=30) diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py 
b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 329e1b798dd6..be35c6c3655e 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -2267,7 +2267,7 @@ def forward( >>> text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) >>> audios = [ librosa.load(BytesIO(urlopen( conversations[1]['content'][1]['audio_url'] ).read()), sr=self.processor.feature_extractor.sampling_rate) ] >>> images, videos = process_vision_info(conversations) - >>> inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True) + >>> inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True) >>> # Generate >>> inputs['use_audio_in_video'] = `True` or `False` @@ -2566,7 +2566,7 @@ def forward( >>> url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3" >>> audio, _ = librosa.load(BytesIO(urlopen(url).read()), sr=self.processor.feature_extractor.sampling_rate) - >>> inputs = processor(text=prompt, audios=audio, return_tensors="pt") + >>> inputs = processor(text=prompt, audio=audio, return_tensors="pt") >>> # Generate >>> generate_ids = model.generate(**inputs, max_length=30) diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py index 736d67b1a2ad..6429bf696553 100644 --- a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py @@ -561,7 +561,7 @@ def _merge_input_ids_with_audio_features( "[INST] <|AUDIO|>\nWhat is that in this audio? [/INST]", "[INST] <|AUDIO|>\nWhat is that in this audio? 
[/INST]", ] - inputs = processor(text=prompts, audios=[audio1, audio2], return_tensors='pt', padding=True).to("cuda") + inputs = processor(text=prompts, audio=[audio1, audio2], return_tensors='pt', padding=True).to("cuda") audio1 has 101 tokens, while audio2 has 72 tokens ``` @@ -735,7 +735,7 @@ def forward( >>> url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3" >>> audio, _ = librosa.load(BytesIO(urlopen(url).read()), sr=self.processor.feature_extractor.sampling_rate) - >>> inputs = processor(text=prompt, audios=audio, return_tensors="pt") + >>> inputs = processor(text=prompt, audio=audio, return_tensors="pt") >>> # Generate >>> generate_ids = model.generate(**inputs, max_length=30) diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py index aabd906dc3b2..4bb3bfe490f2 100644 --- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py @@ -2050,7 +2050,7 @@ def forward( >>> text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) >>> audios = [ librosa.load(BytesIO(urlopen( conversations[1]['content'][1]['audio_url'] ).read()), sr=self.processor.feature_extractor.sampling_rate) ] >>> images, videos = process_vision_info(conversations) - >>> inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True) + >>> inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True) >>> # Generate >>> inputs['use_audio_in_video'] = `True` or `False` diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr.py b/src/transformers/models/rt_detr/image_processing_rt_detr.py index 14993a5a5c9a..966051c535d7 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr.py +++ 
b/src/transformers/models/rt_detr/image_processing_rt_detr.py @@ -486,15 +486,7 @@ def resize( input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. """ - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None - size = get_size_dict(size, max_size=max_size, default_to_square=False) + size = get_size_dict(size, max_size=None, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py index 44946eeed9e3..666844911249 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py @@ -25,7 +25,6 @@ AnnotationFormat, AnnotationType, ChannelDimension, - ImageInput, PILImageResampling, get_image_size, validate_annotations, @@ -334,14 +333,6 @@ def pad( return image, pixel_mask, annotation - @auto_docstring - def preprocess( - self, - images: ImageInput, - **kwargs: Unpack[RTDetrImageProcessorKwargs], - ) -> BatchFeature: - return super().preprocess(images, **kwargs) - def _preprocess( self, images: list["torch.Tensor"], diff --git a/src/transformers/models/rt_detr/modular_rt_detr.py b/src/transformers/models/rt_detr/modular_rt_detr.py index 0f72fdd52845..3083bf975474 100644 --- a/src/transformers/models/rt_detr/modular_rt_detr.py +++ b/src/transformers/models/rt_detr/modular_rt_detr.py @@ -15,7 +15,6 @@ AnnotationFormat, AnnotationType, ChannelDimension, - ImageInput, PILImageResampling, get_image_size, 
validate_annotations, @@ -117,13 +116,6 @@ def __init__(self, **kwargs: Unpack[RTDetrImageProcessorKwargs]) -> None: BaseImageProcessorFast.__init__(self, **kwargs) - def preprocess( - self, - images: ImageInput, - **kwargs: Unpack[RTDetrImageProcessorKwargs], - ) -> BatchFeature: - return BaseImageProcessorFast.preprocess(self, images, **kwargs) - def prepare_annotation( self, image: torch.Tensor, @@ -322,21 +314,6 @@ def post_process_object_detection( return results - def from_dict(self): - raise NotImplementedError("No need to override this method for RT-DETR yet.") - - def post_process(self): - raise NotImplementedError("Post-processing is not implemented for RT-DETR yet.") - - def post_process_segmentation(self): - raise NotImplementedError("Segmentation post-processing is not implemented for RT-DETR yet.") - - def post_process_instance(self): - raise NotImplementedError("Instance post-processing is not implemented for RT-DETR yet.") - - def post_process_panoptic(self): - raise NotImplementedError("Panoptic post-processing is not implemented for RT-DETR yet.") - def post_process_instance_segmentation(self): raise NotImplementedError("Segmentation post-processing is not implemented for RT-DETR yet.") diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py index cd59721180ba..6f1e2316793e 100644 --- a/src/transformers/models/sam/modeling_sam.py +++ b/src/transformers/models/sam/modeling_sam.py @@ -31,11 +31,7 @@ from ...modeling_outputs import BaseModelOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import ( - ModelOutput, - auto_docstring, - logging, -) +from ...utils import ModelOutput, auto_docstring, logging from .configuration_sam import SamConfig, SamMaskDecoderConfig, SamPromptEncoderConfig, SamVisionConfig @@ -848,16 +844,9 @@ def __init__(self, config, window_size): def forward(self, hidden_states: torch.Tensor, 
output_attentions=False) -> torch.Tensor: if output_attentions: logger.warning_once( - "`SamVisionSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " - "`output_attentions=True`. Falling back to the manual attention implementation, but " - "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " - 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - output_attentions=output_attentions, + f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention weights will " + "be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model." ) - batch_size, height, width, _ = hidden_states.shape # qkv with shape (3, B, nHead, H * W, C) qkv = ( diff --git a/src/transformers/models/sam_hq/modeling_sam_hq.py b/src/transformers/models/sam_hq/modeling_sam_hq.py index 5dee354b2600..4de39435a8ec 100644 --- a/src/transformers/models/sam_hq/modeling_sam_hq.py +++ b/src/transformers/models/sam_hq/modeling_sam_hq.py @@ -283,16 +283,9 @@ def __init__(self, config, window_size): def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor: if output_attentions: logger.warning_once( - "`SamHQVisionSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " - "`output_attentions=True`. Falling back to the manual attention implementation, but " - "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " - 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention weights will " + "be `None`. 
If you want to get attention weights, please set `attn_implementation='eager'` when loading the model." ) - return super().forward( - hidden_states=hidden_states, - output_attentions=output_attentions, - ) - batch_size, height, width, _ = hidden_states.shape # qkv with shape (3, B, nHead, H * W, C) qkv = ( diff --git a/src/transformers/models/seamless_m4t/processing_seamless_m4t.py b/src/transformers/models/seamless_m4t/processing_seamless_m4t.py index a506d81af61d..e210da1c9a8e 100644 --- a/src/transformers/models/seamless_m4t/processing_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/processing_seamless_m4t.py @@ -22,7 +22,6 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging -from ...utils.deprecation import deprecate_kwarg logger = logging.get_logger(__name__) @@ -59,11 +58,9 @@ class SeamlessM4TProcessor(ProcessorMixin): def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) - @deprecate_kwarg("audios", version="v4.59.0", new_name="audio") def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audios: Optional[AudioInput] = None, audio: Optional[AudioInput] = None, **kwargs: Unpack[ProcessingKwargs], ): @@ -92,15 +89,10 @@ def __call__( `None`). - **input_features** -- Audio input features to be fed to a model. Returned when `audios` is not `None`. """ - if text is not None and audios is not None: + if text is not None and audio is not None: raise ValueError( "Text and audios are mututally exclusive when passed to `SeamlessM4T`. Specify one or another." ) - if audio is None and audios is not None: - logger.warning( - "Passing `audios` as keyword argument is deprecated and will be removed in v4.63, please pass `audio` instead." 
- ) - audio = audios return super().__call__(text=text, audio=audio, **kwargs) diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 8cf3e2d24036..c7bfb4fd723b 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -20,7 +20,6 @@ # limitations under the License. import math -import warnings from collections.abc import Callable from typing import Optional, Union @@ -877,18 +876,6 @@ def tie_weights(self): elif target_lang is not None: self.load_adapter(target_lang, force_load=True) - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1000,18 +987,6 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 7dda40514663..ce490457ec1c 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -15,7 +15,6 @@ """PyTorch SEW model.""" import math -import warnings from collections.abc import Sequence from typing import Optional, Union @@ -434,17 +433,6 @@ def forward(self, input_values): return hidden_states -class SEWDFeatureExtractor(SEWDFeatureEncoder): - def __init__(self, config): - super().__init__(config) - warnings.warn( - f"The class `{self.__class__.__name__}` has been depreciated " - "and will be removed in Transformers v5. " - f"Use `{self.__class__.__bases__[0].__name__}` instead.", - FutureWarning, - ) - - class ContextPooler(nn.Module): def __init__(self, config): super().__init__() @@ -1430,18 +1418,6 @@ def tie_weights(self): elif target_lang is not None: self.load_adapter(target_lang, force_load=True) - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1554,18 +1530,6 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will diff --git a/src/transformers/models/swin2sr/image_processing_swin2sr.py b/src/transformers/models/swin2sr/image_processing_swin2sr.py index 0ba052e92e05..8a849499107f 100644 --- a/src/transformers/models/swin2sr/image_processing_swin2sr.py +++ b/src/transformers/models/swin2sr/image_processing_swin2sr.py @@ -32,7 +32,6 @@ ) from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, logging -from ...utils.deprecation import deprecate_kwarg logger = logging.get_logger(__name__) @@ -74,20 +73,6 @@ def __init__( pad_size = kwargs.get("pad_size") self.size_divisor = size_divisor if size_divisor is not None else pad_size - @property - def pad_size(self): - logger.warning( - "`self.pad_size` attribute is deprecated and will be removed in v5. Use `self.size_divisor` instead", - ) - return self.size_divisor - - @pad_size.setter - def pad_size(self, value): - logger.warning( - "`self.pad_size` attribute is deprecated and will be removed in v5. 
Use `self.size_divisor` instead", - ) - self.size_divisor = value - def pad( self, image: np.ndarray, @@ -130,7 +115,6 @@ def pad( ) @filter_out_non_signature_kwargs() - @deprecate_kwarg("pad_size", version="v5", new_name="size_divisor") def preprocess( self, images: ImageInput, diff --git a/src/transformers/models/swin2sr/image_processing_swin2sr_fast.py b/src/transformers/models/swin2sr/image_processing_swin2sr_fast.py index bee3da36c9b6..f85c124041bb 100644 --- a/src/transformers/models/swin2sr/image_processing_swin2sr_fast.py +++ b/src/transformers/models/swin2sr/image_processing_swin2sr_fast.py @@ -32,7 +32,6 @@ auto_docstring, logging, ) -from ...utils.deprecation import deprecate_kwarg from .image_processing_swin2sr import Swin2SRImageProcessorKwargs @@ -52,24 +51,9 @@ def __init__(self, **kwargs: Unpack[Swin2SRImageProcessorKwargs]): kwargs.setdefault("size_divisor", pad_size) super().__init__(**kwargs) - @property - def pad_size(self): - logger.warning( - "`self.pad_size` attribute is deprecated and will be removed in v5. Use `self.size_divisor` instead", - ) - return self.size_divisor - - @pad_size.setter - def pad_size(self, value): - logger.warning( - "`self.pad_size` attribute is deprecated and will be removed in v5. Use `self.size_divisor` instead", - ) - self.size_divisor = value - def preprocess(self, images: ImageInput, **kwargs: Unpack[Swin2SRImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) - @deprecate_kwarg("size", version="v5", new_name="size_divisor") def pad(self, images: "torch.Tensor", size_divisor: int) -> "torch.Tensor": """ Pad an image to make the height and width divisible by `size_divisor`. 
@@ -93,7 +77,6 @@ def pad(self, images: "torch.Tensor", size_divisor: int) -> "torch.Tensor": padding_mode="symmetric", ) - @deprecate_kwarg("pad_size", version="v5", new_name="size_divisor") def _preprocess( self, images: list["torch.Tensor"], diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 8bdec6b3cae8..40d63be3ac87 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -20,7 +20,6 @@ # limitations under the License. import math -import warnings from collections.abc import Callable from dataclasses import dataclass from typing import Optional, Union @@ -1072,18 +1071,6 @@ def set_gumbel_temperature(self, temperature: int): """ self.quantizer.temperature = temperature - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1242,18 +1229,6 @@ def tie_weights(self): elif target_lang is not None: self.load_adapter(target_lang, force_load=True) - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1365,18 +1340,6 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will diff --git a/src/transformers/models/unispeech/modular_unispeech.py b/src/transformers/models/unispeech/modular_unispeech.py index 534490235db1..f945c9b24e77 100644 --- a/src/transformers/models/unispeech/modular_unispeech.py +++ b/src/transformers/models/unispeech/modular_unispeech.py @@ -15,7 +15,6 @@ """PyTorch UniSpeech model.""" import math -import warnings from dataclasses import dataclass from typing import Optional, Union @@ -232,9 +231,6 @@ def __init__(self, config: UniSpeechConfig): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - raise AttributeError("Not needed for UniSpeech") - def freeze_feature_encoder(self): raise AttributeError("Not needed for UniSpeech") @@ -318,18 +314,6 @@ def set_gumbel_temperature(self, temperature: int): """ self.quantizer.temperature = temperature - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - 
not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 57e5d3cdbcc0..ba32ec6f5e52 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -1083,18 +1083,6 @@ def set_gumbel_temperature(self, temperature: int): """ self.quantizer.temperature = temperature - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1237,18 +1225,6 @@ def tie_weights(self): elif target_lang is not None: self.load_adapter(target_lang, force_load=True) - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1360,18 +1336,6 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1475,18 +1439,6 @@ def __init__(self, config): self.init_weights() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1643,18 +1595,6 @@ def __init__(self, config): self.init_weights() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. 
- """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will diff --git a/src/transformers/models/unispeech_sat/modular_unispeech_sat.py b/src/transformers/models/unispeech_sat/modular_unispeech_sat.py index e209c7c18ea3..8939e6e2ebd6 100644 --- a/src/transformers/models/unispeech_sat/modular_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modular_unispeech_sat.py @@ -15,7 +15,6 @@ """PyTorch UniSpeechSat model.""" import math -import warnings from dataclasses import dataclass from typing import Optional, Union @@ -243,9 +242,6 @@ def __init__(self, config: UniSpeechSatConfig): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - raise AttributeError("Not needed for UniSpeechSat") - def freeze_feature_encoder(self): raise AttributeError("Not needed for UniSpeechSat") @@ -336,18 +332,6 @@ def set_gumbel_temperature(self, temperature: int): """ self.quantizer.temperature = temperature - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index 5c1b2acf6e4b..c74885375399 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -183,9 +183,6 @@ def __init__( do_pad: bool = True, **kwargs, ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") - super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 384} size = get_size_dict(size, default_to_square=False) @@ -199,19 +196,7 @@ def __init__( self.do_normalize = do_normalize self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD - self.do_pad = do_pad - - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure `pad_and_return_pixel_mask` is updated if image processor - is created using from_dict and kwargs e.g. 
`ViltImageProcessor.from_pretrained(checkpoint, - pad_and_return_pixel_mask=False)` - """ - image_processor_dict = image_processor_dict.copy() - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) + self.do_pad = kwargs.pop("pad_and_return_pixel_mask", do_pad) def resize( self, diff --git a/src/transformers/models/vitmatte/image_processing_vitmatte.py b/src/transformers/models/vitmatte/image_processing_vitmatte.py index ea54ba603435..7a6b847b34b6 100644 --- a/src/transformers/models/vitmatte/image_processing_vitmatte.py +++ b/src/transformers/models/vitmatte/image_processing_vitmatte.py @@ -35,7 +35,6 @@ ) from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, logging -from ...utils.deprecation import deprecate_kwarg logger = logging.get_logger(__name__) @@ -96,20 +95,6 @@ def __init__( size_divisibility = kwargs.get("size_divisibility") self.size_divisor = size_divisibility if size_divisibility is not None else size_divisor - @property - def size_divisibility(self): - logger.warning( - "`self.size_divisibility` attribute is deprecated and will be removed in v5. Use `self.size_divisor` instead" - ) - return self.size_divisor - - @size_divisibility.setter - def size_divisibility(self, value): - logger.warning( - "`self.size_divisibility` attribute is deprecated and will be removed in v5. 
Use `self.size_divisor` instead" - ) - self.size_divisor = value - def pad_image( self, image: np.ndarray, @@ -152,7 +137,6 @@ def pad_image( return image @filter_out_non_signature_kwargs() - @deprecate_kwarg("size_divisibility", version="v5", new_name="size_divisor") def preprocess( self, images: ImageInput, diff --git a/src/transformers/models/vitmatte/image_processing_vitmatte_fast.py b/src/transformers/models/vitmatte/image_processing_vitmatte_fast.py index dd09b987090d..54f54d18cc89 100644 --- a/src/transformers/models/vitmatte/image_processing_vitmatte_fast.py +++ b/src/transformers/models/vitmatte/image_processing_vitmatte_fast.py @@ -61,20 +61,6 @@ def __init__(self, **kwargs: Unpack[VitMatteImageProcessorKwargs]) -> None: kwargs.setdefault("size_divisor", size_divisibility) super().__init__(**kwargs) - @property - def size_divisibility(self): - logger.warning( - "`self.size_divisibility` attribute is deprecated and will be removed in v5. Use `self.size_divisor` instead" - ) - return self.size_divisor - - @size_divisibility.setter - def size_divisibility(self, value): - logger.warning( - "`self.size_divisibility` attribute is deprecated and will be removed in v5. Use `self.size_divisor` instead" - ) - self.size_divisor = value - def _pad_image( self, images: torch.Tensor, diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 82399d0933dc..a3eaf7d66440 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -423,17 +423,6 @@ def forward(self, input_values): return hidden_states -class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder): - def __init__(self, config): - super().__init__(config) - warnings.warn( - f"The class `{self.__class__.__name__}` has been depreciated " - "and will be removed in Transformers v5. 
" - f"Use `{self.__class__.__bases__[0].__name__}` instead.", - FutureWarning, - ) - - class Wav2Vec2FeatureProjection(nn.Module): def __init__(self, config): super().__init__() @@ -1165,7 +1154,6 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): token = kwargs.pop("token", None) revision = kwargs.pop("revision", None) use_safetensors = kwargs.pop("use_safetensors", None) - model_path_or_id = self.config._name_or_path state_dict = None @@ -1290,18 +1278,6 @@ def __init__(self, config: Wav2Vec2Config): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1439,18 +1415,6 @@ def set_gumbel_temperature(self, temperature: int): """ self.quantizer.temperature = temperature - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1741,18 +1705,6 @@ def tie_weights(self): elif target_lang is not None: self.load_adapter(target_lang, force_load=True) - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1864,18 +1816,6 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1979,18 +1919,6 @@ def __init__(self, config): self.init_weights() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. 
- """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -2147,18 +2075,6 @@ def __init__(self, config): self.init_weights() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py index 8a8b7ded7116..6b865f708a5f 100644 --- a/src/transformers/models/wav2vec2/processing_wav2vec2.py +++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py @@ -21,8 +21,6 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AudioInput, PreTokenizedInput, TextInput -from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor -from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer class Wav2Vec2ProcessorKwargs(ProcessingKwargs, total=False): @@ -47,25 +45,6 @@ class Wav2Vec2Processor(ProcessorMixin): def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - try: - return 
super().from_pretrained(pretrained_model_name_or_path, **kwargs) - except (OSError, ValueError): - warnings.warn( - f"Loading a tokenizer inside {cls.__name__} from a config that does not" - " include a `tokenizer_class` attribute is deprecated and will be " - "removed in v5. Please add `'tokenizer_class': 'Wav2Vec2CTCTokenizer'`" - " attribute to either your `config.json` or `tokenizer_config.json` " - "file to suppress this warning: ", - FutureWarning, - ) - - feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) - tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) - - return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) - def __call__( self, audio: Optional[AudioInput] = None, diff --git a/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py index 3bce99771f55..7257375d018f 100644 --- a/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py @@ -689,9 +689,6 @@ def __init__(self, config: Wav2Vec2BertConfig): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - raise AttributeError("Not needed for Wav2Vec2Bert") - def freeze_feature_encoder(self): raise AttributeError("Not needed for Wav2Vec2Bert") @@ -838,9 +835,6 @@ class Wav2Vec2BertForSequenceClassification(Wav2Vec2ForSequenceClassification): def __init__(self, config): super().__init__(config) - def freeze_feature_extractor(self): - raise AttributeError("Not needed for Wav2Vec2Bert") - def freeze_feature_encoder(self): raise AttributeError("Not needed for Wav2Vec2Bert") diff --git a/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py index 90da8b651677..09f27f39ff86 100644 --- 
a/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py @@ -16,13 +16,10 @@ Speech processor class for Wav2Vec2-BERT """ -import warnings from typing import Optional, Union from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AudioInput, PreTokenizedInput, TextInput -from ..seamless_m4t.feature_extraction_seamless_m4t import SeamlessM4TFeatureExtractor -from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer class Wav2Vec2BertProcessorKwargs(ProcessingKwargs, total=False): @@ -47,25 +44,6 @@ class Wav2Vec2BertProcessor(ProcessorMixin): def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - try: - return super().from_pretrained(pretrained_model_name_or_path, **kwargs) - except OSError: - warnings.warn( - f"Loading a tokenizer inside {cls.__name__} from a config that does not" - " include a `tokenizer_class` attribute is deprecated and will be " - "removed in v5. 
Please add `'tokenizer_class': 'Wav2Vec2CTCTokenizer'`" - " attribute to either your `config.json` or `tokenizer_config.json` " - "file to suppress this warning: ", - FutureWarning, - ) - - feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs) - tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) - - return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) - def __call__( self, audio: Optional[AudioInput] = None, diff --git a/src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py index 7a0e757a8496..e8a0a0dccdd1 100644 --- a/src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py @@ -662,17 +662,11 @@ def __init__(self, config: Wav2Vec2ConformerConfig): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - raise AttributeError("Not needed for Wav2Vec2Conformer") - class Wav2Vec2ConformerForPreTraining(Wav2Vec2ForPreTraining): def __init__(self, config: Wav2Vec2ConformerConfig): super().__init__(config) - def freeze_feature_extractor(self): - raise AttributeError("Not needed for Wav2Vec2Conformer") - class Wav2Vec2ConformerForCTC(Wav2Vec2ForCTC): def __init__(self, config, target_lang: Optional[str] = None): @@ -687,9 +681,6 @@ def __init__(self, config, target_lang: Optional[str] = None): def tie_weights(self): raise AttributeError("Not needed for Wav2Vec2Conformer") - def freeze_feature_extractor(self): - raise AttributeError("Not needed for Wav2Vec2Conformer") - def freeze_base_model(self): raise AttributeError("Not needed for Wav2Vec2Conformer") @@ -698,25 +689,16 @@ class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ForSequenceClassificati def __init__(self, config): super().__init__(config) - def 
freeze_feature_extractor(self): - raise AttributeError("Not needed for Wav2Vec2Conformer") - class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ForAudioFrameClassification): def __init__(self, config): super().__init__(config) - def freeze_feature_extractor(self): - raise AttributeError("Not needed for Wav2Vec2Conformer") - class Wav2Vec2ConformerForXVector(Wav2Vec2ForXVector): def __init__(self, config): super().__init__(config) - def freeze_feature_extractor(self): - raise AttributeError("Not needed for Wav2Vec2Conformer") - __all__ = [ "Wav2Vec2ConformerForAudioFrameClassification", diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index 274d83fa8914..ead5bf104dbd 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -983,18 +983,6 @@ def __init__(self, config: WavLMConfig): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1166,18 +1154,6 @@ def tie_weights(self): elif target_lang is not None: self.load_adapter(target_lang, force_load=True) - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. 
- """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1289,18 +1265,6 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameters will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1404,18 +1368,6 @@ def __init__(self, config): self.init_weights() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -1572,18 +1524,6 @@ def __init__(self, config): self.init_weights() - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py index 34d9a8965be8..d2a9914aa822 100644 --- a/src/transformers/models/whisper/tokenization_whisper.py +++ b/src/transformers/models/whisper/tokenization_whisper.py @@ -16,7 +16,6 @@ import json import os -import warnings from functools import lru_cache from typing import Optional, Union @@ -499,20 +498,6 @@ def _convert_id_to_token(self, index): """ return self.decoder.get(index, "") - def _normalize(self, text): - warnings.warn( - "The private method `_normalize` is deprecated and will be removed in v5 of Transformers." - "You can normalize an input string using the Whisper English normalizer using the `normalize` method." - ) - return self.normalize(text) - - def _basic_normalize(self, text, remove_diacritics=False): - warnings.warn( - "The private method `_basic_normalize` is deprecated and will be removed in v5 of Transformers." 
- "You can normalize an input string using the Whisper basic normalizer using the `basic_normalize` method." - ) - return self.basic_normalize(text, remove_diacritics=remove_diacritics) - def normalize(self, text): """ Normalize a given string using the `EnglishTextNormalizer` class, which performs commons transformation on diff --git a/src/transformers/models/whisper/tokenization_whisper_fast.py b/src/transformers/models/whisper/tokenization_whisper_fast.py index fbcf8ea757bd..904f099243f9 100644 --- a/src/transformers/models/whisper/tokenization_whisper_fast.py +++ b/src/transformers/models/whisper/tokenization_whisper_fast.py @@ -17,7 +17,6 @@ import json import os import re -import warnings from functools import lru_cache from typing import Optional @@ -393,30 +392,14 @@ def _decode( text = super()._decode(*args, **kwargs) if normalize: - clean_text = self._normalize(text) + clean_text = self.normalize(text) return clean_text elif basic_normalize: - clean_text = self._basic_normalize(text, remove_diacritics=remove_diacritics) + clean_text = self.basic_normalize(text, remove_diacritics=remove_diacritics) return clean_text else: return text - # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._normalize - def _normalize(self, text): - warnings.warn( - "The private method `_normalize` is deprecated and will be removed in v5 of Transformers." - "You can normalize an input string using the Whisper English normalizer using the `normalize` method." - ) - return self.normalize(text) - - # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._basic_normalize - def _basic_normalize(self, text, remove_diacritics=False): - warnings.warn( - "The private method `_basic_normalize` is deprecated and will be removed in v5 of Transformers." - "You can normalize an input string using the Whisper basic normalizer using the `basic_normalize` method." 
- ) - return self.basic_normalize(text, remove_diacritics=remove_diacritics) - # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.normalize def normalize(self, text): """ diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 0d33b6c761bf..ed4ff14f6f91 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -786,18 +786,7 @@ def __init__( pad_size: Optional[dict[str, int]] = None, **kwargs, ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - + max_size = None if size is None else kwargs.pop("max_size", 1333) size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} size = get_size_dict(size, max_size=max_size, default_to_square=False) @@ -816,7 +805,7 @@ def __init__( self.do_convert_annotations = do_convert_annotations self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD - self.do_pad = do_pad + self.do_pad = kwargs.pop("pad_and_return_pixel_mask", do_pad) self.pad_size = pad_size self._valid_processor_keys = [ "images", @@ -840,21 +829,6 @@ def __init__( "input_data_format", ] - @classmethod - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->Yolos - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. 
`YolosImageProcessor.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation def prepare_annotation( self, @@ -923,15 +897,7 @@ def resize( input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. """ - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None - size = get_size_dict(size, max_size=max_size, default_to_square=False) + size = get_size_dict(size, max_size=None, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format @@ -1264,20 +1230,6 @@ def preprocess( provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. 
""" - if "pad_and_return_pixel_mask" in kwargs: - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in v4.33, " - "use `do_pad` instead.", - ) - do_pad = kwargs.pop("pad_and_return_pixel_mask") - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in v4.33, use" - " `size['longest_edge']` instead.", - ) - size = kwargs.pop("max_size") - do_resize = self.do_resize if do_resize is None else do_resize size = self.size if size is None else size size = get_size_dict(size=size, default_to_square=False) @@ -1427,48 +1379,6 @@ def preprocess( return encoded_inputs - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process with Detr->Yolos - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, - bottom_right_x, bottom_right_y) format. - - Args: - outputs ([`YolosObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). For visualization, this should be the image size - after data augment, but before padding. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. 
- """ - logger.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = nn.functional.softmax(out_logits, -1) - scores, labels = prob[..., :-1].max(-1) - - # convert to [x0, y0, x1, y1] format - boxes = center_to_corners_format(out_bbox) - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - return results - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_object_detection with Detr->Yolos def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: Optional[Union[TensorType, list[tuple]]] = None diff --git a/src/transformers/models/yolos/image_processing_yolos_fast.py b/src/transformers/models/yolos/image_processing_yolos_fast.py index fc1f1852862f..53e7318c6285 100644 --- a/src/transformers/models/yolos/image_processing_yolos_fast.py +++ b/src/transformers/models/yolos/image_processing_yolos_fast.py @@ -26,19 +26,16 @@ AnnotationFormat, AnnotationType, ChannelDimension, - ImageInput, PILImageResampling, get_image_size, validate_annotations, ) from ...processing_utils import Unpack -from ...utils import TensorType, auto_docstring, logging +from ...utils import TensorType, auto_docstring from ...utils.import_utils import requires from 
.image_processing_yolos import YolosImageProcessorKwargs -logger = logging.get_logger(__name__) - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -305,19 +302,10 @@ class YolosImageProcessorFast(BaseImageProcessorFast): valid_kwargs = YolosImageProcessorKwargs def __init__(self, **kwargs: Unpack[YolosImageProcessorKwargs]) -> None: - if "pad_and_return_pixel_mask" in kwargs: - kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") + kwargs.setdefault("do_pad", kwargs.pop("pad_and_return_pixel_mask", self.do_pad)) size = kwargs.pop("size", None) - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` parameter is deprecated and will be removed in v4.26. " - "Please specify in `size['longest_edge'] instead`.", - ) - max_size = kwargs.pop("max_size") - else: - max_size = None if size is None else 1333 - + max_size = None if size is None else kwargs.pop("max_size", 1333) size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} self.size = get_size_dict(size, max_size=max_size, default_to_square=False) @@ -329,20 +317,6 @@ def __init__(self, **kwargs: Unpack[YolosImageProcessorKwargs]) -> None: super().__init__(**kwargs) - @classmethod - def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): - """ - Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. 
`YolosImageProcessorFast.from_pretrained(checkpoint, size=600, - max_size=800)` - """ - image_processor_dict = image_processor_dict.copy() - if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") - if "pad_and_return_pixel_mask" in kwargs: - image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") - return super().from_dict(image_processor_dict, **kwargs) - def prepare_annotation( self, image: torch.Tensor, @@ -562,28 +536,6 @@ def pad( return image, pixel_mask, annotation - @auto_docstring - def preprocess( - self, - images: ImageInput, - **kwargs: Unpack[YolosImageProcessorKwargs], - ) -> BatchFeature: - if "pad_and_return_pixel_mask" in kwargs: - kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") - logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " - "use `do_pad` instead." - ) - - if "max_size" in kwargs: - logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" - " `size['longest_edge']` instead." - ) - kwargs["size"] = kwargs.pop("max_size") - - return super().preprocess(images, **kwargs) - def _preprocess( self, images: list["torch.Tensor"], @@ -700,51 +652,6 @@ def _preprocess( ] return encoded_inputs - def post_process(self, outputs, target_sizes): - """ - Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. - - Args: - outputs ([`YolosObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). For visualization, this should be the image size - after data augment, but before padding. 
- Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. - """ - logger.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] - - return results - def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None, top_k: int = 100 ): diff --git a/src/transformers/models/yolos/modular_yolos.py b/src/transformers/models/yolos/modular_yolos.py index 13f3db41b675..5de64f7cf5cf 100644 --- a/src/transformers/models/yolos/modular_yolos.py +++ b/src/transformers/models/yolos/modular_yolos.py @@ -64,51 +64,6 @@ def get_size_with_aspect_ratio( class YolosImageProcessorFast(DetrImageProcessorFast): - def post_process(self, outputs, 
target_sizes): - """ - Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x, - top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. - - Args: - outputs ([`YolosObjectDetectionOutput`]): - Raw outputs of the model. - target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): - Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the - original image size (before any data augmentation). For visualization, this should be the image size - after data augment, but before padding. - Returns: - `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image - in the batch as predicted by the model. - """ - logger.warning_once( - "`post_process` is deprecated and will be removed in v5 of Transformers, please use" - " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", - ) - - out_logits, out_bbox = outputs.logits, outputs.pred_boxes - - if len(out_logits) != len(target_sizes): - raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") - if target_sizes.shape[1] != 2: - raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") - - prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) - scores = topk_values - topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") - labels = topk_indexes % out_logits.shape[2] - boxes = center_to_corners_format(out_bbox) - boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) - - # and from relative [0, 1] to absolute [0, height] coordinates - img_h, img_w = target_sizes.unbind(1) - scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) - boxes = boxes * scale_fct[:, None, :] - - results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in 
zip(scores, labels, boxes)] - - return results - def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None, top_k: int = 100 ): @@ -168,15 +123,6 @@ def post_process_object_detection( return results - def post_process_segmentation(self): - raise NotImplementedError("Segmentation post-processing is not implemented for Deformable DETR yet.") - - def post_process_instance(self): - raise NotImplementedError("Instance post-processing is not implemented for Deformable DETR yet.") - - def post_process_panoptic(self): - raise NotImplementedError("Panoptic post-processing is not implemented for Deformable DETR yet.") - def post_process_instance_segmentation(self): raise NotImplementedError("Segmentation post-processing is not implemented for Deformable DETR yet.") diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 2fdd83da233b..74c04d604ff3 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -61,7 +61,6 @@ logging, ) from .utils.chat_template_utils import render_jinja_template -from .utils.deprecation import deprecate_kwarg from .utils.type_validators import ( device_validator, image_size_validator, @@ -630,6 +629,9 @@ def __call__( Returns: [`BatchFeature`]: A [`BatchFeature`] object with processed inputs in a dict format. """ + if "audios" in kwargs and audio is None: + raise ValueError("You passed keyword argument `audios` which is deprecated. Please use `audio` instead.") + if images is None and text is None and videos is None and audio is None: raise ValueError(f"You need to provide at least one input to call {self.__class__.__name__}") @@ -1534,12 +1536,6 @@ def validate_init_kwargs(processor_config, valid_kwargs): return unused_kwargs, valid_kwargs - @deprecate_kwarg("video_fps", version="4.58", new_name="fps") - @deprecate_kwarg( - "video_load_backend", - version="4.59", - additional_message=". 
This function will use `torchcodec` by default, or `torchvision` if `torchcodec` is not installed.", - ) def apply_chat_template( self, conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]], @@ -1627,9 +1623,6 @@ def apply_chat_template( if value is not None and not isinstance(value, dict): processed_kwargs[kwarg_type][key] = value - # pop unused and deprecated kwarg - kwargs.pop("video_load_backend", None) - # Pass unprocessed custom kwargs processed_kwargs["template_kwargs"].update(kwargs) diff --git a/tests/fixtures/config.json b/tests/fixtures/config.json new file mode 100644 index 000000000000..5436a4252722 --- /dev/null +++ b/tests/fixtures/config.json @@ -0,0 +1,4 @@ +{ + "model_type": "wav2vec2" +} + \ No newline at end of file diff --git a/tests/models/auto/test_processor_auto.py b/tests/models/auto/test_processor_auto.py index 3719c89b5660..4f7c1e808770 100644 --- a/tests/models/auto/test_processor_auto.py +++ b/tests/models/auto/test_processor_auto.py @@ -62,6 +62,7 @@ SAMPLE_PROCESSOR_CONFIG = get_tests_dir("fixtures/dummy_feature_extractor_config.json") SAMPLE_VOCAB_LLAMA = get_tests_dir("fixtures/test_sentencepiece.model") SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json") +SAMPLE_CONFIG = get_tests_dir("fixtures/config.json") SAMPLE_PROCESSOR_CONFIG_DIR = get_tests_dir("fixtures") @@ -102,6 +103,7 @@ def test_processor_from_local_directory_from_extractor_config(self): # copy relevant files copyfile(SAMPLE_PROCESSOR_CONFIG, os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME)) copyfile(SAMPLE_VOCAB, os.path.join(tmpdirname, "vocab.json")) + copyfile(SAMPLE_CONFIG, os.path.join(tmpdirname, "config.json")) processor = AutoProcessor.from_pretrained(tmpdirname) diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 035236bcd5fc..d852ffb6a943 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -942,7 +942,7 @@ def 
test_get_text_features(self): model = Blip2Model(config).to(torch_device) model.eval() text_features = model.get_text_features(**inputs_dict) - self.assertEqual(text_features[0].shape, (1, 10, config.text_config.vocab_size)) + self.assertEqual(text_features[0].shape, (10, config.text_config.vocab_size)) def test_get_image_features(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -957,11 +957,7 @@ def test_get_image_features(self): image_features = model.get_image_features(**inputs_dict) self.assertEqual( image_features[0].shape, - ( - self.model_tester.vision_model_tester.batch_size, - self.model_tester.vision_model_tester.seq_length, - config.vision_config.hidden_size, - ), + (config.vision_config.hidden_size,), ) def test_get_qformer_features(self): @@ -977,7 +973,7 @@ def test_get_qformer_features(self): qformer_features = model.get_qformer_features(**inputs_dict) self.assertEqual( qformer_features[0].shape, - (self.model_tester.vision_model_tester.batch_size, 10, config.vision_config.hidden_size), + (10, config.vision_config.hidden_size), ) @unittest.skip("T5 backbone deepcopies the configs, and fixing it would be more involved") diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 3075a0df1e58..4327f0a15878 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -562,7 +562,7 @@ def test_integration_unfused(self): processor = ClapProcessor.from_pretrained(model_id) for padding in self.paddings: - inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt", padding=padding).to( + inputs = processor(audio=audio_sample["audio"]["array"], return_tensors="pt", padding=padding).to( torch_device ) @@ -590,7 +590,7 @@ def test_integration_fused(self): for padding in self.paddings: inputs = processor( - audios=audio_sample["audio"]["array"], return_tensors="pt", padding=padding, truncation="fusion" + 
audio=audio_sample["audio"]["array"], return_tensors="pt", padding=padding, truncation="fusion" ).to(torch_device) audio_embed = model.get_audio_features(**inputs) @@ -616,7 +616,7 @@ def test_batched_fused(self): processor = ClapProcessor.from_pretrained(model_id) for padding in self.paddings: - inputs = processor(audios=audio_samples, return_tensors="pt", padding=padding, truncation="fusion").to( + inputs = processor(audio=audio_samples, return_tensors="pt", padding=padding, truncation="fusion").to( torch_device ) @@ -643,7 +643,7 @@ def test_batched_unfused(self): processor = ClapProcessor.from_pretrained(model_id) for padding in self.paddings: - inputs = processor(audios=audio_samples, return_tensors="pt", padding=padding).to(torch_device) + inputs = processor(audio=audio_samples, return_tensors="pt", padding=padding).to(torch_device) audio_embed = model.get_audio_features(**inputs) expected_mean = EXPECTED_MEANS_FUSED[padding] diff --git a/tests/models/clap/test_processing_clap.py b/tests/models/clap/test_processing_clap.py index cce1705a51a7..ce6000ed2c57 100644 --- a/tests/models/clap/test_processing_clap.py +++ b/tests/models/clap/test_processing_clap.py @@ -79,7 +79,7 @@ def test_feature_extractor(self): raw_speech = floats_list((3, 1000)) input_feat_extract = feature_extractor(raw_speech, return_tensors="np") - input_processor = processor(audios=raw_speech, return_tensors="np") + input_processor = processor(audio=raw_speech, return_tensors="np") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py index 4b02a1257844..c8392eff0d1d 100644 --- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py +++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py @@ -160,11 +160,8 @@ def 
test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) self.assertEqual(image_processor.do_pad, True) - image_processor = image_processing_class.from_dict( - self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False - ) - self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) - self.assertEqual(image_processor.do_pad, False) + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 1333}) @slow def test_call_pytorch_with_coco_detection_annotations(self): diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py index f6bf929eadf9..594bcbceb178 100644 --- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py +++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py @@ -165,11 +165,8 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) self.assertEqual(image_processor.do_pad, True) - image_processor = image_processing_class.from_dict( - self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False - ) - self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) - self.assertEqual(image_processor.do_pad, False) + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 1333}) @slow def test_call_pytorch_with_coco_detection_annotations(self): diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index d9cff61a1061..3a51b05dc7e3 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ 
b/tests/models/detr/test_image_processing_detr.py @@ -169,11 +169,8 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) self.assertEqual(image_processor.do_pad, True) - image_processor = image_processing_class.from_dict( - self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False - ) - self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) - self.assertEqual(image_processor.do_pad, False) + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 1333}) def test_should_raise_if_annotation_format_invalid(self): image_processor_dict = self.image_processor_tester.prepare_image_processor_dict() diff --git a/tests/models/glm4v/test_processor_glm4v.py b/tests/models/glm4v/test_processor_glm4v.py index b22cdce7a4e9..0b52faa66b3c 100644 --- a/tests/models/glm4v/test_processor_glm4v.py +++ b/tests/models/glm4v/test_processor_glm4v.py @@ -215,7 +215,7 @@ def test_apply_chat_template_video_frame_sampling(self): add_generation_prompt=True, tokenize=True, return_dict=True, - video_fps=video_fps, + fps=video_fps, ) self.assertTrue(self.videos_input_name in out_dict_with_video) self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8) diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index 2c4ecb297e62..b085cfeda7f8 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -179,11 +179,8 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) self.assertEqual(image_processor.do_pad, True) - image_processor = image_processing_class.from_dict( - 
self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False - ) - self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) - self.assertEqual(image_processor.do_pad, False) + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 1333}) def test_post_process_object_detection(self): for image_processing_class in self.image_processor_list: diff --git a/tests/models/maskformer/test_image_processing_maskformer.py b/tests/models/maskformer/test_image_processing_maskformer.py index bd5c8abec5bc..be61f940e12d 100644 --- a/tests/models/maskformer/test_image_processing_maskformer.py +++ b/tests/models/maskformer/test_image_processing_maskformer.py @@ -432,30 +432,6 @@ def test_binary_mask_to_rle(self): self.assertEqual(rle[0], 21) self.assertEqual(rle[1], 45) - def test_post_process_segmentation(self): - for image_processing_class in self.image_processor_list: - feature_extractor = image_processing_class(num_labels=self.image_processor_tester.num_classes) - outputs = self.image_processor_tester.get_fake_maskformer_outputs() - segmentation = feature_extractor.post_process_segmentation(outputs) - - self.assertEqual( - segmentation.shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_classes, - self.image_processor_tester.height, - self.image_processor_tester.width, - ), - ) - - target_size = (1, 4) - segmentation = feature_extractor.post_process_segmentation(outputs, target_size=target_size) - - self.assertEqual( - segmentation.shape, - (self.image_processor_tester.batch_size, self.image_processor_tester.num_classes, *target_size), - ) - def test_post_process_semantic_segmentation(self): for image_processing_class in self.image_processor_list: feature_extractor = image_processing_class(num_labels=self.image_processor_tester.num_classes) diff --git 
a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py index 95f858df2f5b..b2ac9930a4f9 100644 --- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py @@ -867,7 +867,7 @@ def input_audio(self): sampling_rate = 16000 input_features = torch.rand((2, seq_len)) - return self.processor(audios=[input_features.tolist()], sampling_rate=sampling_rate, return_tensors="pt").to( + return self.processor(audio=[input_features.tolist()], sampling_rate=sampling_rate, return_tensors="pt").to( torch_device ) diff --git a/tests/models/seamless_m4t/test_processing_seamless_m4t.py b/tests/models/seamless_m4t/test_processing_seamless_m4t.py index 929255a82fbd..5fccf3d92a4c 100644 --- a/tests/models/seamless_m4t/test_processing_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_processing_seamless_m4t.py @@ -86,7 +86,7 @@ def test_feature_extractor(self): raw_speech = floats_list((3, 1000)) input_feat_extract = feature_extractor(raw_speech, return_tensors="np") - input_processor = processor(audios=raw_speech, return_tensors="np") + input_processor = processor(audio=raw_speech, return_tensors="np") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py index 3c68901fe98a..1a07f8b7d3c7 100644 --- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py +++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py @@ -929,7 +929,7 @@ def input_audio(self): sampling_rate = 16000 input_features = torch.rand((2, seq_len)) - return self.processor(audios=[input_features.tolist()], sampling_rate=sampling_rate, return_tensors="pt").to( + return self.processor(audio=[input_features.tolist()], sampling_rate=sampling_rate, return_tensors="pt").to( 
torch_device ) diff --git a/tests/models/swin2sr/test_image_processing_swin2sr.py b/tests/models/swin2sr/test_image_processing_swin2sr.py index 2cf3edaf4386..dfff76b98454 100644 --- a/tests/models/swin2sr/test_image_processing_swin2sr.py +++ b/tests/models/swin2sr/test_image_processing_swin2sr.py @@ -117,7 +117,6 @@ def test_image_processor_properties(self): self.assertTrue(hasattr(image_processing, "rescale_factor")) self.assertTrue(hasattr(image_processing, "do_pad")) self.assertTrue(hasattr(image_processing, "size_divisor")) - self.assertTrue(hasattr(image_processing, "pad_size")) # deprecated but should be available def calculate_expected_size(self, image): old_height, old_width = get_image_size(image) diff --git a/tests/models/vitmatte/test_image_processing_vitmatte.py b/tests/models/vitmatte/test_image_processing_vitmatte.py index b100fb3c30b6..2802aad7eb4d 100644 --- a/tests/models/vitmatte/test_image_processing_vitmatte.py +++ b/tests/models/vitmatte/test_image_processing_vitmatte.py @@ -126,8 +126,6 @@ def test_image_processor_properties(self): self.assertTrue(hasattr(image_processing, "rescale_factor")) self.assertTrue(hasattr(image_processing, "do_pad")) self.assertTrue(hasattr(image_processing, "size_divisor")) - # Check size_divisibility for BC, the image proccessor has to have an atribute - self.assertTrue(hasattr(image_processing, "size_divisibility")) def test_call_numpy(self): # create random numpy tensors diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py index 2ef2f197aecb..18cdb8c867ba 100644 --- a/tests/models/yolos/test_image_processing_yolos.py +++ b/tests/models/yolos/test_image_processing_yolos.py @@ -171,11 +171,8 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) self.assertEqual(image_processor.do_pad, True) - image_processor = image_processing_class.from_dict( - 
self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False - ) - self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) - self.assertEqual(image_processor.do_pad, False) + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 1333}) def test_equivalence_padding(self): # Initialize image_processings