diff --git a/docs/source/en/model_doc/layoutxlm.md b/docs/source/en/model_doc/layoutxlm.md index 19051f55b683..9cedc7bad2d5 100644 --- a/docs/source/en/model_doc/layoutxlm.md +++ b/docs/source/en/model_doc/layoutxlm.md @@ -70,6 +70,12 @@ data for the model. As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to [LayoutLMv2's documentation page](layoutlmv2) for all tips, code examples and notebooks. + +## LayoutXLMConfig + +[[autodoc]] LayoutXLMConfig + + ## LayoutXLMTokenizer [[autodoc]] LayoutXLMTokenizer diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c55980e471c7..5f6274edca3e 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -222,7 +222,7 @@ ("layoutlm", "LayoutLMConfig"), ("layoutlmv2", "LayoutLMv2Config"), ("layoutlmv3", "LayoutLMv3Config"), - ("layoutxlm", "LayoutLMv2Config"), + ("layoutxlm", "LayoutXLMConfig"), ("led", "LEDConfig"), ("levit", "LevitConfig"), ("lfm2", "Lfm2Config"), @@ -915,12 +915,14 @@ [ ("audioflamingo3_encoder", "audioflamingo3"), ("openai-gpt", "openai"), + ("blip-2", "blip_2"), ("data2vec-audio", "data2vec"), ("data2vec-text", "data2vec"), ("data2vec-vision", "data2vec"), ("donut-swin", "donut"), ("kosmos-2", "kosmos2"), ("kosmos-2.5", "kosmos2_5"), + ("omdet-turbo", "omdet_turbo"), ("maskformer-swin", "maskformer"), ("xclip", "x_clip"), ("clip_vision_model", "clip"), @@ -936,7 +938,10 @@ ("glm4v_moe_vision", "glm4v_moe"), ("glm4v_text", "glm4v"), ("glm4v_moe_text", "glm4v_moe"), + ("grounding-dino", "grounding_dino"), + ("mm-grounding-dino", "mm_grounding_dino"), ("idefics3_vision", "idefics3"), + ("mgp-str", "mgp_str"), ("siglip_vision_model", "siglip"), ("siglip2_vision_model", "siglip2"), ("aimv2_vision_model", "aimv2"), @@ -962,6 +967,7 @@ ("video_llama_3_vision", "video_llama_3"), ("parakeet_encoder", "parakeet"), ("parakeet_ctc", "parakeet"), + ("wav2vec2-bert", "wav2vec2_bert"), ] ) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index c4d6eb4a4c96..90604b5ff436 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -130,7 +130,7 @@ ("levit", ("LevitImageProcessor", "LevitImageProcessorFast")), ("lfm2_vl", (None, "Lfm2VlImageProcessorFast")), ("lightglue", ("LightGlueImageProcessor", "LightGlueImageProcessorFast")), - ("llama4", ("Llama4ImageProcessor", "Llama4ImageProcessorFast")), + ("llama4", (None, "Llama4ImageProcessorFast")), ("llava", ("LlavaImageProcessor", "LlavaImageProcessorFast")), ("llava_next", ("LlavaNextImageProcessor", "LlavaNextImageProcessorFast")), ("llava_next_video", ("LlavaNextImageProcessor", "LlavaNextImageProcessorFast")), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 9e6f4e66ff4d..d89faee4c13a 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -95,6 +95,7 @@ ("kyutai_speech_to_text", "KyutaiSpeechToTextProcessor"), ("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv3", "LayoutLMv3Processor"), + ("layoutxlm", "LayoutXLMProcessor"), ("lfm2_vl", "Lfm2VlProcessor"), ("llama4", "Llama4Processor"), ("llava", "LlavaProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 5edda2f5be8c..523ae7ff0b31 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -393,7 +393,7 @@ ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("llava_onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("llava_onevision", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ( "longt5", diff --git a/src/transformers/models/edgetam/modeling_edgetam.py b/src/transformers/models/edgetam/modeling_edgetam.py index b23370e47f14..59979abbcf69 100644 --- a/src/transformers/models/edgetam/modeling_edgetam.py +++ b/src/transformers/models/edgetam/modeling_edgetam.py @@ -1103,7 +1103,7 @@ def forward( >>> # Postprocess masks >>> masks = processor.post_process_masks( - ... outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"] + ... outputs.pred_masks, inputs["original_sizes"] ... ) ``` """ diff --git a/src/transformers/models/gemma3n/processing_gemma3n.py b/src/transformers/models/gemma3n/processing_gemma3n.py index 51b686557ed0..7c2c244b471b 100644 --- a/src/transformers/models/gemma3n/processing_gemma3n.py +++ b/src/transformers/models/gemma3n/processing_gemma3n.py @@ -147,5 +147,13 @@ def __call__( text_inputs["token_type_ids"] = token_type_ids.tolist() return BatchFeature(data={**text_inputs, **image_inputs, **audio_inputs}, tensor_type=return_tensors) + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + ["token_type_ids"] + image_processor_input_names = self.image_processor.model_input_names + audio_processor_input_names = self.feature_extractor.model_input_names + image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"] + return list(tokenizer_input_names + image_processor_input_names + audio_processor_input_names) + __all__ = ["Gemma3nProcessor"] diff --git a/src/transformers/models/glm46v/modeling_glm46v.py b/src/transformers/models/glm46v/modeling_glm46v.py index 2f00d22fb040..e3da448e79fd 100644 --- a/src/transformers/models/glm46v/modeling_glm46v.py +++ b/src/transformers/models/glm46v/modeling_glm46v.py @@ -562,8 +562,6 @@ def forward( The temporal, height and width of feature shape of each image in LLM. video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): The temporal, height and width of feature shape of each video in LLM. - rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): - The rope index difference between sequence length and multimodal rope. Example: diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index ff5e0a00cc0d..2589e016c756 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ b/src/transformers/models/glm4v/modeling_glm4v.py @@ -1410,8 +1410,6 @@ def forward( The temporal, height and width of feature shape of each image in LLM. video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): The temporal, height and width of feature shape of each video in LLM. - rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): - The rope index difference between sequence length and multimodal rope. Example: diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index dce3ef92c996..ddace4e5855b 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -1350,8 +1350,6 @@ def forward( The temporal, height and width of feature shape of each image in LLM. video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): The temporal, height and width of feature shape of each video in LLM. - rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): - The rope index difference between sequence length and multimodal rope. Example: diff --git a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py index 373d49bc942c..9537d9018838 100644 --- a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py @@ -1630,8 +1630,6 @@ def forward( The temporal, height and width of feature shape of each image in LLM. video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): The temporal, height and width of feature shape of each video in LLM. - rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): - The rope index difference between sequence length and multimodal rope. Example: diff --git a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py index b204f769ec09..3ce3c0e7f1ae 100644 --- a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py @@ -39,7 +39,7 @@ class LayoutLMv2Config(PreTrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 30522): Vocabulary size of the LayoutLMv2 model. Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`LayoutLMv2Model`] or [`TFLayoutLMv2Model`]. + the `inputs_ids` passed when calling [`LayoutLMv2Model`]. hidden_size (`int`, *optional*, defaults to 768): Dimension of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -59,12 +59,13 @@ class LayoutLMv2Config(PreTrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`LayoutLMv2Model`] or - [`TFLayoutLMv2Model`]. + The vocabulary size of the `token_type_ids` passed when calling [`LayoutLMv2Model`]. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. max_2d_position_embeddings (`int`, *optional*, defaults to 1024): The maximum value that the 2D position embedding might ever be used with. Typically set this to something large just in case (e.g., 1024). @@ -78,7 +79,9 @@ class LayoutLMv2Config(PreTrainedConfig): The maximum number of relative 2D positions in the self-attention mechanism. rel_2d_pos_bins (`int`, *optional*, defaults to 64): The number of 2D relative position bins in the self-attention mechanism. - image_feature_pool_shape (`list[int]`, *optional*, defaults to [7, 7, 256]): + convert_sync_batchnorm (`bool`, *optional*, defaults to `True`): + Whether or not to convert batch normalization layers to synchronized batch normalization layers. + image_feature_pool_shape (`list[int]`, *optional*, defaults to `[7, 7, 256]`): The shape of the average-pooled feature map. coordinate_size (`int`, *optional*, defaults to 128): Dimension of the coordinate embeddings. @@ -95,6 +98,7 @@ class LayoutLMv2Config(PreTrainedConfig): file](https://github.com/microsoft/unilm/blob/master/layoutlmft/layoutlmft/models/layoutlmv2/detectron2_config.py) for details regarding default values. + Example: ```python diff --git a/src/transformers/models/layoutxlm/__init__.py b/src/transformers/models/layoutxlm/__init__.py index 6f5a662e6afd..9b338ce14185 100644 --- a/src/transformers/models/layoutxlm/__init__.py +++ b/src/transformers/models/layoutxlm/__init__.py @@ -18,6 +18,7 @@ if TYPE_CHECKING: + from .configuration_layoutxlm import * from .processing_layoutxlm import * from .tokenization_layoutxlm import * from .tokenization_layoutxlm_fast import * diff --git a/src/transformers/models/layoutxlm/configuration_layoutxlm.py b/src/transformers/models/layoutxlm/configuration_layoutxlm.py new file mode 100644 index 000000000000..e232c4d6ce74 --- /dev/null +++ b/src/transformers/models/layoutxlm/configuration_layoutxlm.py @@ -0,0 +1,228 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/layoutxlm/modular_layoutxlm.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_layoutxlm.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ...configuration_utils import PreTrainedConfig +from ...utils import is_detectron2_available + + +# soft dependency +if is_detectron2_available(): + import detectron2 + + +class LayoutXLMConfig(PreTrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LayoutXLMModel`]. It is used to instantiate an + LayoutXLM model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the LayoutXLM + [microsoft/layoutxlm-base](https://huggingface.co/microsoft/layoutxlm-base) architecture. + + Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PreTrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the LayoutXLM model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`LayoutXLMModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`LayoutXLMModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + max_2d_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum value that the 2D position embedding might ever be used with. Typically set this to something + large just in case (e.g., 1024). + max_rel_pos (`int`, *optional*, defaults to 128): + The maximum number of relative positions to be used in the self-attention mechanism. + rel_pos_bins (`int`, *optional*, defaults to 32): + The number of relative position bins to be used in the self-attention mechanism. + fast_qkv (`bool`, *optional*, defaults to `True`): + Whether or not to use a single matrix for the queries, keys, values in the self-attention layers. + max_rel_2d_pos (`int`, *optional*, defaults to 256): + The maximum number of relative 2D positions in the self-attention mechanism. + rel_2d_pos_bins (`int`, *optional*, defaults to 64): + The number of 2D relative position bins in the self-attention mechanism. + convert_sync_batchnorm (`bool`, *optional*, defaults to `True`): + Whether or not to convert batch normalization layers to synchronized batch normalization layers. + image_feature_pool_shape (`list[int]`, *optional*, defaults to `[7, 7, 256]`): + The shape of the average-pooled feature map. + coordinate_size (`int`, *optional*, defaults to 128): + Dimension of the coordinate embeddings. + shape_size (`int`, *optional*, defaults to 128): + Dimension of the width and height embeddings. + has_relative_attention_bias (`bool`, *optional*, defaults to `True`): + Whether or not to use a relative attention bias in the self-attention mechanism. + has_spatial_attention_bias (`bool`, *optional*, defaults to `True`): + Whether or not to use a spatial attention bias in the self-attention mechanism. + has_visual_segment_embedding (`bool`, *optional*, defaults to `False`): + Whether or not to add visual segment embeddings. + detectron2_config_args (`dict`, *optional*): + Dictionary containing the configuration arguments of the Detectron2 visual backbone. Refer to [this + file](https://github.com/microsoft/unilm/blob/master/layoutlmft/layoutlmft/models/layoutxlm/detectron2_config.py) + for details regarding default values. + + + Example: + + ```python + >>> from transformers import LayoutXLMConfig, LayoutXLMModel + + >>> # Initializing a LayoutXLM microsoft/layoutxlm-base style configuration + >>> configuration = LayoutXLMConfig() + + >>> # Initializing a model (with random weights) from the microsoft/layoutxlm-base style configuration + >>> model = LayoutXLMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "layoutxlm" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + max_2d_position_embeddings=1024, + max_rel_pos=128, + rel_pos_bins=32, + fast_qkv=True, + max_rel_2d_pos=256, + rel_2d_pos_bins=64, + convert_sync_batchnorm=True, + image_feature_pool_shape=[7, 7, 256], + coordinate_size=128, + shape_size=128, + has_relative_attention_bias=True, + has_spatial_attention_bias=True, + has_visual_segment_embedding=False, + detectron2_config_args=None, + **kwargs, + ): + super().__init__( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + hidden_dropout_prob=hidden_dropout_prob, + attention_probs_dropout_prob=attention_probs_dropout_prob, + max_position_embeddings=max_position_embeddings, + type_vocab_size=type_vocab_size, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + pad_token_id=pad_token_id, + **kwargs, + ) + self.max_2d_position_embeddings = max_2d_position_embeddings + self.max_rel_pos = max_rel_pos + self.rel_pos_bins = rel_pos_bins + self.fast_qkv = fast_qkv + self.max_rel_2d_pos = max_rel_2d_pos + self.rel_2d_pos_bins = rel_2d_pos_bins + self.convert_sync_batchnorm = convert_sync_batchnorm + self.image_feature_pool_shape = image_feature_pool_shape + self.coordinate_size = coordinate_size + self.shape_size = shape_size + self.has_relative_attention_bias = has_relative_attention_bias + self.has_spatial_attention_bias = has_spatial_attention_bias + self.has_visual_segment_embedding = has_visual_segment_embedding + self.detectron2_config_args = ( + detectron2_config_args if detectron2_config_args is not None else self.get_default_detectron2_config() + ) + + @classmethod + def get_default_detectron2_config(cls): + return { + "MODEL.MASK_ON": True, + "MODEL.PIXEL_STD": [57.375, 57.120, 58.395], + "MODEL.BACKBONE.NAME": "build_resnet_fpn_backbone", + "MODEL.FPN.IN_FEATURES": ["res2", "res3", "res4", "res5"], + "MODEL.ANCHOR_GENERATOR.SIZES": [[32], [64], [128], [256], [512]], + "MODEL.RPN.IN_FEATURES": ["p2", "p3", "p4", "p5", "p6"], + "MODEL.RPN.PRE_NMS_TOPK_TRAIN": 2000, + "MODEL.RPN.PRE_NMS_TOPK_TEST": 1000, + "MODEL.RPN.POST_NMS_TOPK_TRAIN": 1000, + "MODEL.POST_NMS_TOPK_TEST": 1000, + "MODEL.ROI_HEADS.NAME": "StandardROIHeads", + "MODEL.ROI_HEADS.NUM_CLASSES": 5, + "MODEL.ROI_HEADS.IN_FEATURES": ["p2", "p3", "p4", "p5"], + "MODEL.ROI_BOX_HEAD.NAME": "FastRCNNConvFCHead", + "MODEL.ROI_BOX_HEAD.NUM_FC": 2, + "MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION": 14, + "MODEL.ROI_MASK_HEAD.NAME": "MaskRCNNConvUpsampleHead", + "MODEL.ROI_MASK_HEAD.NUM_CONV": 4, + "MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION": 7, + "MODEL.RESNETS.DEPTH": 101, + "MODEL.RESNETS.SIZES": [[32], [64], [128], [256], [512]], + "MODEL.RESNETS.ASPECT_RATIOS": [[0.5, 1.0, 2.0]], + "MODEL.RESNETS.OUT_FEATURES": ["res2", "res3", "res4", "res5"], + "MODEL.RESNETS.NUM_GROUPS": 32, + "MODEL.RESNETS.WIDTH_PER_GROUP": 8, + "MODEL.RESNETS.STRIDE_IN_1X1": False, + } + + def get_detectron2_config(self): + detectron2_config = detectron2.config.get_cfg() + for k, v in self.detectron2_config_args.items(): + attributes = k.split(".") + to_set = detectron2_config + for attribute in attributes[:-1]: + to_set = getattr(to_set, attribute) + setattr(to_set, attributes[-1], v) + + return detectron2_config + + +__all__ = ["LayoutXLMConfig"] diff --git a/src/transformers/models/layoutxlm/modular_layoutxlm.py b/src/transformers/models/layoutxlm/modular_layoutxlm.py new file mode 100644 index 000000000000..a6afacf7a650 --- /dev/null +++ b/src/transformers/models/layoutxlm/modular_layoutxlm.py @@ -0,0 +1,109 @@ +# coding=utf-8 +# Copyright Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..layoutlmv2.configuration_layoutlmv2 import LayoutLMv2Config + + +class LayoutXLMConfig(LayoutLMv2Config): + r""" + This is the configuration class to store the configuration of a [`LayoutXLMModel`]. It is used to instantiate an + LayoutXLM model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the LayoutXLM + [microsoft/layoutxlm-base](https://huggingface.co/microsoft/layoutxlm-base) architecture. + + Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PreTrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the LayoutXLM model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`LayoutXLMModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`LayoutXLMModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + max_2d_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum value that the 2D position embedding might ever be used with. Typically set this to something + large just in case (e.g., 1024). + max_rel_pos (`int`, *optional*, defaults to 128): + The maximum number of relative positions to be used in the self-attention mechanism. + rel_pos_bins (`int`, *optional*, defaults to 32): + The number of relative position bins to be used in the self-attention mechanism. + fast_qkv (`bool`, *optional*, defaults to `True`): + Whether or not to use a single matrix for the queries, keys, values in the self-attention layers. + max_rel_2d_pos (`int`, *optional*, defaults to 256): + The maximum number of relative 2D positions in the self-attention mechanism. + rel_2d_pos_bins (`int`, *optional*, defaults to 64): + The number of 2D relative position bins in the self-attention mechanism. + convert_sync_batchnorm (`bool`, *optional*, defaults to `True`): + Whether or not to convert batch normalization layers to synchronized batch normalization layers. + image_feature_pool_shape (`list[int]`, *optional*, defaults to `[7, 7, 256]`): + The shape of the average-pooled feature map. + coordinate_size (`int`, *optional*, defaults to 128): + Dimension of the coordinate embeddings. + shape_size (`int`, *optional*, defaults to 128): + Dimension of the width and height embeddings. + has_relative_attention_bias (`bool`, *optional*, defaults to `True`): + Whether or not to use a relative attention bias in the self-attention mechanism. + has_spatial_attention_bias (`bool`, *optional*, defaults to `True`): + Whether or not to use a spatial attention bias in the self-attention mechanism. + has_visual_segment_embedding (`bool`, *optional*, defaults to `False`): + Whether or not to add visual segment embeddings. + detectron2_config_args (`dict`, *optional*): + Dictionary containing the configuration arguments of the Detectron2 visual backbone. Refer to [this + file](https://github.com/microsoft/unilm/blob/master/layoutlmft/layoutlmft/models/layoutxlm/detectron2_config.py) + for details regarding default values. + + + Example: + + ```python + >>> from transformers import LayoutXLMConfig, LayoutXLMModel + + >>> # Initializing a LayoutXLM microsoft/layoutxlm-base style configuration + >>> configuration = LayoutXLMConfig() + + >>> # Initializing a model (with random weights) from the microsoft/layoutxlm-base style configuration + >>> model = LayoutXLMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + pass + + +__all__ = ["LayoutXLMConfig"] diff --git a/src/transformers/models/llava_next/image_processing_llava_next_fast.py b/src/transformers/models/llava_next/image_processing_llava_next_fast.py index 19d6fb941e7b..cc5dc756b237 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next_fast.py +++ b/src/transformers/models/llava_next/image_processing_llava_next_fast.py @@ -47,6 +47,7 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast): # To be checked against the slow image processor # None values left after checking can be removed + model_input_names = ["pixel_values", "image_sizes"] resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN image_std = OPENAI_CLIP_STD @@ -253,9 +254,7 @@ def _preprocess( ) processed_image_patches_grouped[shape] = stacked_image_patches processed_image_patches = reorder_images(processed_image_patches_grouped, grouped_image_patches_index) - processed_image_patches = ( - torch.stack(processed_image_patches, dim=0) if return_tensors else processed_image_patches - ) + processed_image_patches = torch.stack(processed_image_patches, dim=0) processed_images.append(processed_image_patches) image_sizes.append(get_image_size(image, ChannelDimension.FIRST)) diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py index b80b2b76b1a7..beb1c1b982e0 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py @@ -47,6 +47,7 @@ @auto_docstring class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast): + model_input_names = ["pixel_values", "image_sizes", "batch_num_images"] resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN image_std = OPENAI_CLIP_STD @@ -61,7 +62,6 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast): do_pad = True image_grid_pinpoints = [[384, 384], [384, 768], [384, 1152], [384, 1536], [384, 1920], [384, 2304], [768, 384], [768, 768], [768, 1152], [768, 1536], [768, 1920], [768, 2304], [1152, 384], [1152, 768], [1152, 1152], [1152, 1536], [1152, 1920], [1152, 2304], [1536, 384], [1536, 768], [1536, 1152], [1536, 1536], [1536, 1920], [1536, 2304], [1920, 384], [1920, 768], [1920, 1152], [1920, 1536], [1920, 1920], [1920, 2304], [2304, 384], [2304, 768], [2304, 1152], [2304, 1536], [2304, 1920], [2304, 2304]] # fmt: skip valid_kwargs = LlavaOnevisionImageProcessorKwargs - model_input_names = ["pixel_values", "image_sizes", "batch_num_images"] def __init__(self, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]): super().__init__(**kwargs) @@ -273,9 +273,7 @@ def _preprocess( ) processed_image_patches_grouped[shape] = stacked_image_patches processed_image_patches = reorder_images(processed_image_patches_grouped, grouped_image_patches_index) - processed_image_patches = ( - torch.stack(processed_image_patches, dim=0) if return_tensors else processed_image_patches - ) + processed_image_patches = torch.stack(processed_image_patches, dim=0) processed_images.append(processed_image_patches) image_sizes.append(get_image_size(image, ChannelDimension.FIRST)) diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index 88d1c10ab122..dd714def07c2 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -205,9 +205,7 @@ def _preprocess( ) processed_image_patches_grouped[shape] = stacked_image_patches processed_image_patches = reorder_images(processed_image_patches_grouped, grouped_image_patches_index) - processed_image_patches = ( - torch.stack(processed_image_patches, dim=0) if return_tensors else processed_image_patches - ) + processed_image_patches = torch.stack(processed_image_patches, dim=0) processed_images.append(processed_image_patches) image_sizes.append(get_image_size(image, ChannelDimension.FIRST)) diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py index 7686b43f00e8..87cee3210fd3 100644 --- a/src/transformers/models/mgp_str/processing_mgp_str.py +++ b/src/transformers/models/mgp_str/processing_mgp_str.py @@ -210,5 +210,10 @@ def wp_decode(self, sequences): decode_strs = [seq.replace(" ", "") for seq in self.wp_tokenizer.batch_decode(sequences)] return decode_strs + @property + def model_input_names(self): + image_processor_input_names = self.image_processor.model_input_names + return image_processor_input_names + ["labels"] + __all__ = ["MgpstrProcessor"] diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index 3c6dd617c214..7998f9b045ea 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -47,7 +47,7 @@ class OwlViTProcessorKwargs(ProcessingKwargs, total=False): "padding": "max_length", }, "common_kwargs": { - "return_tensors": "np", + "return_tensors": "pt", }, } diff --git a/src/transformers/models/sam/image_processing_sam_fast.py b/src/transformers/models/sam/image_processing_sam_fast.py index 54dbcf52c17f..fa824daee4be 100644 --- a/src/transformers/models/sam/image_processing_sam_fast.py +++ b/src/transformers/models/sam/image_processing_sam_fast.py @@ -27,6 +27,10 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_transforms import ( + group_images_by_shape, + reorder_images, +) from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -37,7 +41,10 @@ pil_torch_interpolation_mapping, ) from ...processing_utils import Unpack -from ...utils import auto_docstring +from ...utils import ( + TensorType, + auto_docstring, +) from .image_processing_sam import SamImageProcessorKwargs @@ -182,12 +189,11 @@ def _preprocess_image_like_inputs( ) original_sizes = [image.shape[-2:] for image in images] images_kwargs = kwargs.copy() - pixel_values = self._preprocess(images, **images_kwargs)["pixel_values"] - reshaped_input_sizes = [image.shape[-2:] for image in images] + image_outputs = self._preprocess(images, **images_kwargs) data = { - "pixel_values": pixel_values, + "pixel_values": image_outputs.pixel_values, "original_sizes": original_sizes, - "reshaped_input_sizes": reshaped_input_sizes, + "reshaped_input_sizes": image_outputs.reshaped_input_sizes, } if segmentation_maps is not None: @@ -215,6 +221,58 @@ def _preprocess_image_like_inputs( return BatchFeature(data=data, tensor_type=kwargs["return_tensors"]) + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + do_pad: Optional[bool], + pad_size: Optional[SizeDict], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + reshaped_input_sizes = [image.shape[-2:] for image in resized_images] + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + + if do_pad: + processed_images = self.pad(processed_images, pad_size=pad_size, disable_grouping=disable_grouping) + + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + return BatchFeature( + data={"pixel_values": processed_images, "reshaped_input_sizes": reshaped_input_sizes}, + tensor_type=return_tensors, + ) + def generate_crop_boxes( self, image: "torch.Tensor", @@ -378,7 +436,7 @@ def post_process_masks( (`torch.Tensor`): Batched masks in batch_size, num_channels, height, width) format, where (height, width) is given by original_size. """ - pad_size = self.size if pad_size is None else pad_size + pad_size = self.pad_size if pad_size is None else pad_size target_image_size = (pad_size["height"], pad_size["width"]) if isinstance(original_sizes, (torch.Tensor, np.ndarray)): original_sizes = original_sizes.tolist() diff --git a/src/transformers/models/sam2/image_processing_sam2_fast.py b/src/transformers/models/sam2/image_processing_sam2_fast.py index 68b9a55f04fd..81fd58a46700 100644 --- a/src/transformers/models/sam2/image_processing_sam2_fast.py +++ b/src/transformers/models/sam2/image_processing_sam2_fast.py @@ -492,6 +492,14 @@ def _preprocess_image_like_inputs( return BatchFeature(data=data, tensor_type=kwargs["return_tensors"]) + def _preprocess( + self, + images: list["torch.Tensor"], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> "torch.Tensor": + return super()._preprocess(images, return_tensors=return_tensors, **kwargs).pixel_values + def generate_crop_boxes( self, image: "torch.Tensor", @@ -693,14 +701,6 @@ def post_process_for_mask_generation(self, all_masks, all_scores, all_boxes, cro """ return _post_process_for_mask_generation(all_masks, all_scores, all_boxes, crops_nms_thresh) - def _preprocess( - self, - images: list["torch.Tensor"], - return_tensors: Optional[Union[str, TensorType]], - **kwargs, - ) -> "torch.Tensor": - return super()._preprocess(images, return_tensors=return_tensors, **kwargs).pixel_values - def _apply_non_overlapping_constraints(self, pred_masks: torch.Tensor) -> torch.Tensor: """ Apply non-overlapping constraints to the object scores in pred_masks. Here we diff --git a/src/transformers/models/sam2/modeling_sam2.py b/src/transformers/models/sam2/modeling_sam2.py index 8e62d6d99e76..39a091d7b2a4 100644 --- a/src/transformers/models/sam2/modeling_sam2.py +++ b/src/transformers/models/sam2/modeling_sam2.py @@ -1462,7 +1462,7 @@ def forward( >>> # Postprocess masks >>> masks = processor.post_process_masks( - ... outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"] + ... outputs.pred_masks, inputs["original_sizes"] ... ) ``` """ diff --git a/src/transformers/models/sam2/modular_sam2.py b/src/transformers/models/sam2/modular_sam2.py index 0c40c989fe00..a564a2b4dbea 100644 --- a/src/transformers/models/sam2/modular_sam2.py +++ b/src/transformers/models/sam2/modular_sam2.py @@ -1370,7 +1370,7 @@ def forward( >>> # Postprocess masks >>> masks = processor.post_process_masks( - ... outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"] + ... outputs.pred_masks, inputs["original_sizes"] ... ) ``` """ diff --git a/src/transformers/models/sam2/processing_sam2.py b/src/transformers/models/sam2/processing_sam2.py index 05dbe7347edd..4c1854aef2ff 100644 --- a/src/transformers/models/sam2/processing_sam2.py +++ b/src/transformers/models/sam2/processing_sam2.py @@ -518,5 +518,10 @@ def post_process_masks( **kwargs, ) + @property + def model_input_names(self): + image_processor_input_names = self.image_processor.model_input_names + return list(image_processor_input_names + ["original_sizes"]) + __all__ = ["Sam2Processor"] diff --git a/src/transformers/models/sam2_video/processing_sam2_video.py b/src/transformers/models/sam2_video/processing_sam2_video.py index 0583e820e3bc..e8fd04ba1386 100644 --- a/src/transformers/models/sam2_video/processing_sam2_video.py +++ b/src/transformers/models/sam2_video/processing_sam2_video.py @@ -521,6 +521,11 @@ def post_process_masks( **kwargs, ) + @property + def model_input_names(self): + image_processor_input_names = self.image_processor.model_input_names + return list(image_processor_input_names + ["original_sizes"]) + def init_video_session( self, video: Optional[VideoInput] = None, diff --git a/src/transformers/models/sam2_video/video_processing_sam2_video.py b/src/transformers/models/sam2_video/video_processing_sam2_video.py index 873bf2c378ab..be5b8c991c7c 100644 --- a/src/transformers/models/sam2_video/video_processing_sam2_video.py +++ b/src/transformers/models/sam2_video/video_processing_sam2_video.py @@ -18,7 +18,7 @@ import numpy as np import torch -from torch.nn import functional as F_t +import torch.nn.functional as F from ...image_processing_utils import BatchFeature from ...image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling, SizeDict @@ -35,6 +35,7 @@ class Sam2VideoVideoProcessor(BaseVideoProcessor): do_rescale = True do_normalize = True do_convert_rgb = True + model_input_names = ["pixel_values"] def _preprocess( self, @@ -93,9 +94,9 @@ def post_process_masks( masks[i] = torch.from_numpy(masks[i]) elif not isinstance(masks[i], torch.Tensor): raise TypeError("Input masks should be a list of `torch.tensors` or a list of `np.ndarray`") - interpolated_mask = F_t.interpolate(masks[i], target_image_size, mode="bilinear", align_corners=False) + interpolated_mask = F.interpolate(masks[i], target_image_size, mode="bilinear", align_corners=False) interpolated_mask = interpolated_mask[..., : reshaped_input_sizes[i][0], : reshaped_input_sizes[i][1]] - interpolated_mask = F_t.interpolate(interpolated_mask, original_size, mode="bilinear", align_corners=False) + interpolated_mask = F.interpolate(interpolated_mask, original_size, mode="bilinear", align_corners=False) if binarize: interpolated_mask = interpolated_mask > mask_threshold output_masks.append(interpolated_mask) diff --git a/src/transformers/models/sam3/image_processing_sam3_fast.py b/src/transformers/models/sam3/image_processing_sam3_fast.py index 90089a334bbb..656824703a7b 100644 --- a/src/transformers/models/sam3/image_processing_sam3_fast.py +++ b/src/transformers/models/sam3/image_processing_sam3_fast.py @@ -522,6 +522,14 @@ def _preprocess_image_like_inputs( return BatchFeature(data=data, tensor_type=kwargs["return_tensors"]) + def _preprocess( + self, + images: list["torch.Tensor"], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> "torch.Tensor": + return super()._preprocess(images, return_tensors=return_tensors, **kwargs).pixel_values + def generate_crop_boxes( self, image: "torch.Tensor", @@ -723,14 +731,6 @@ def post_process_for_mask_generation(self, all_masks, all_scores, all_boxes, cro """ return _post_process_for_mask_generation(all_masks, all_scores, all_boxes, crops_nms_thresh) - def _preprocess( - self, - images: list["torch.Tensor"], - return_tensors: Optional[Union[str, TensorType]], - **kwargs, - ) -> "torch.Tensor": - return super()._preprocess(images, return_tensors=return_tensors, **kwargs).pixel_values - def _apply_non_overlapping_constraints(self, pred_masks: torch.Tensor) -> torch.Tensor: """ Apply non-overlapping constraints to the object scores in pred_masks. Here we diff --git a/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py b/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py index d89ea6001048..f3d36e33fe5d 100644 --- a/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py +++ b/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py @@ -959,7 +959,7 @@ def forward( >>> # Postprocess masks >>> masks = processor.post_process_masks( - ... outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"] + ... outputs.pred_masks, inputs["original_sizes"] ... ) ``` """ diff --git a/src/transformers/models/sam3_tracker/processing_sam3_tracker.py b/src/transformers/models/sam3_tracker/processing_sam3_tracker.py index 96e123913936..6cbb399597a0 100644 --- a/src/transformers/models/sam3_tracker/processing_sam3_tracker.py +++ b/src/transformers/models/sam3_tracker/processing_sam3_tracker.py @@ -517,5 +517,10 @@ def post_process_masks( **kwargs, ) + @property + def model_input_names(self): + image_processor_input_names = self.image_processor.model_input_names + return list(image_processor_input_names + ["original_sizes"]) + __all__ = ["Sam3TrackerProcessor"] diff --git a/src/transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py b/src/transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py index 7ca6b97518cd..5659eeb4e5d8 100644 --- a/src/transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +++ b/src/transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py @@ -522,6 +522,11 @@ def post_process_masks( **kwargs, ) + @property + def model_input_names(self): + image_processor_input_names = self.image_processor.model_input_names + return list(image_processor_input_names + ["original_sizes"]) + def init_video_session( self, video: Optional[VideoInput] = None, diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index 1434a9ca5a2d..80230631c3c1 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -93,6 +93,7 @@ def __call__( input_points = output_kwargs["images_kwargs"].pop("input_points", None) input_labels = output_kwargs["images_kwargs"].pop("input_labels", None) input_boxes = output_kwargs["images_kwargs"].pop("input_boxes", None) + point_pad_value = output_kwargs["images_kwargs"].pop("point_pad_value", None) encoding_image_processor = self.image_processor( images, @@ -117,7 +118,7 @@ def __call__( input_labels=input_labels, input_boxes=input_boxes, return_tensors=output_kwargs["images_kwargs"].get("return_tensors"), - point_pad_value=output_kwargs["images_kwargs"].get("point_pad_value"), + point_pad_value=point_pad_value, ) return encoding_image_processor diff --git a/tests/models/align/test_processing_align.py b/tests/models/align/test_processing_align.py index bb799abdd243..84be4efa3483 100644 --- a/tests/models/align/test_processing_align.py +++ b/tests/models/align/test_processing_align.py @@ -12,15 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import shutil -import tempfile import unittest -import pytest - -from transformers import BertTokenizer, BertTokenizerFast -from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import is_vision_available @@ -28,16 +21,16 @@ if is_vision_available(): - from transformers import AlignProcessor, EfficientNetImageProcessor + from transformers import AlignProcessor @require_vision class AlignProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = AlignProcessor - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") vocab_tokens = [ "[UNK]", "[CLS]", @@ -55,133 +48,22 @@ def setUp(self): "low", "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - - image_processor_map = { - "do_resize": True, - "size": 20, - "do_normalize": True, - "image_mean": [0.48145466, 0.4578275, 0.40821073], - "image_std": [0.26862954, 0.26130258, 0.27577711], - } - image_processor = EfficientNetImageProcessor(**image_processor_map) - processor = AlignProcessor(tokenizer=self.get_tokenizer(), image_processor=image_processor) - processor.save_pretrained(self.tmpdirname) - - image_processor = EfficientNetImageProcessor.from_pretrained(self.tmpdirname) - image_processor.save_pretrained(self.tmpdirname) - tokenizer = BertTokenizer.from_pretrained(self.tmpdirname) - tokenizer.save_pretrained(self.tmpdirname) - - def get_tokenizer(self, **kwargs): - return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_rust_tokenizer(self, **kwargs): - return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - - def get_image_processor(self, **kwargs): - return EfficientNetImageProcessor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_save_load_pretrained_default(self): - tokenizer_slow = self.get_tokenizer() - tokenizer_fast = self.get_rust_tokenizer() - image_processor = self.get_image_processor() - - processor_slow = AlignProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) - processor_slow.save_pretrained(self.tmpdirname) - processor_slow = AlignProcessor.from_pretrained(self.tmpdirname, use_fast=False) - - processor_fast = AlignProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) - processor_fast.save_pretrained(self.tmpdirname) - processor_fast = AlignProcessor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) - self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) - self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) - self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) - self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) - - self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor_slow.image_processor, EfficientNetImageProcessor) - self.assertIsInstance(processor_fast.image_processor, EfficientNetImageProcessor) - - def test_save_load_pretrained_additional_features(self): - processor = AlignProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = AlignProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + vocab_file = f"{cls.tmpdirname}/vocab.txt" + with open(vocab_file, "w", encoding="utf-8") as f: + f.write("\n".join(vocab_tokens)) + + tokenizer = tokenizer_class(vocab_file) + return tokenizer + + @classmethod + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + + image_processor = image_processor_class( + do_resize=True, + size=20, + do_normalize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, BertTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, EfficientNetImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_image_proc = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_image_proc: - self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str, padding="max_length", max_length=64) - for key in encoded_tok: - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertSetEqual(set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values"}) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) + return image_processor diff --git a/tests/models/altclip/test_processing_altclip.py b/tests/models/altclip/test_processing_altclip.py index f498ce4aa87c..d06850d86b1c 100644 --- a/tests/models/altclip/test_processing_altclip.py +++ b/tests/models/altclip/test_processing_altclip.py @@ -13,10 +13,9 @@ # limitations under the License. -import tempfile import unittest -from transformers import AltCLIPProcessor, CLIPImageProcessor, XLMRobertaTokenizer, XLMRobertaTokenizerFast +from transformers import AltCLIPProcessor from transformers.testing_utils import require_vision from ...test_processing_common import ProcessorTesterMixin @@ -25,23 +24,4 @@ @require_vision class AltClipProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = AltCLIPProcessor - - @classmethod - def setUpClass(cls): - cls.model_id = "BAAI/AltCLIP" - cls.tmpdirname = tempfile.mkdtemp() - image_processor = CLIPImageProcessor() - tokenizer = XLMRobertaTokenizer.from_pretrained(cls.model_id) - - processor = cls.processor_class(image_processor, tokenizer) - - processor.save_pretrained(cls.tmpdirname) - - def get_tokenizer(self, **kwargs): - return XLMRobertaTokenizer.from_pretrained(self.model_id, **kwargs) - - def get_rust_tokenizer(self, **kwargs): - return XLMRobertaTokenizerFast.from_pretrained(self.model_id, **kwargs) - - def get_image_processor(self, **kwargs): - return CLIPImageProcessor.from_pretrained(self.model_id, **kwargs) + model_id = "BAAI/AltCLIP" diff --git a/tests/models/aria/test_processing_aria.py b/tests/models/aria/test_processing_aria.py index 3ff8aad72cca..0fa5143da518 100644 --- a/tests/models/aria/test_processing_aria.py +++ b/tests/models/aria/test_processing_aria.py @@ -12,15 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest import numpy as np from transformers import AriaProcessor from transformers.image_utils import load_image -from transformers.models.auto.processing_auto import AutoProcessor from transformers.testing_utils import require_torch, require_vision from ...test_processing_common import ProcessorTesterMixin, url_to_local_path @@ -29,13 +26,17 @@ @require_torch @require_vision class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase): + # NOTE: setUpClass, tearDownClass, and getter methods have been removed. + # They are now automatically handled by ProcessorTesterMixin. + # This test only needs: processor_class = YourProcessor + # Optionally: model_id = "some/model" to load from specific pretrained model + # Optionally: prepare_processor_dict() for custom processor kwargs. + processor_class = AriaProcessor + model_id = "m-ric/Aria_hf_2" @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - processor = AriaProcessor.from_pretrained("m-ric/Aria_hf_2", size_conversion={490: 2, 980: 2}) - processor.save_pretrained(cls.tmpdirname) + def _setup_test_attributes(cls, processor): cls.image1 = load_image( url_to_local_path( "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" @@ -72,23 +73,6 @@ def prepare_processor_dict(): "size_conversion": {490: 2, 980: 2}, } # fmt: skip - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def get_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) - - @classmethod - def tearDownClass(cls): - cls.image1.close() - cls.image2.close() - cls.image3.close() - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - - # Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens def test_get_num_vision_tokens(self): "Tests general functionality of the helper used internally in vLLM" diff --git a/tests/models/aya_vision/test_processing_aya_vision.py b/tests/models/aya_vision/test_processing_aya_vision.py index ef77e7355a3b..8d4611eb2374 100644 --- a/tests/models/aya_vision/test_processing_aya_vision.py +++ b/tests/models/aya_vision/test_processing_aya_vision.py @@ -12,13 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -from transformers import AutoProcessor, AutoTokenizer, AyaVisionProcessor +from transformers import AyaVisionProcessor from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_torch_available, is_vision_available +from transformers.utils import is_torch_available from ...test_processing_common import ProcessorTesterMixin, url_to_local_path @@ -27,19 +25,24 @@ import torch -if is_vision_available(): - from transformers import GotOcr2ImageProcessor - - @require_vision class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = AyaVisionProcessor + model_id = "hf-internal-testing/namespace-CohereForAI-repo_name_aya-vision-8b" @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() + def _setup_test_attributes(cls, processor): + cls.image_token = processor.image_token - image_processor = GotOcr2ImageProcessor( + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class.from_pretrained(cls.model_id, padding_side="left") + + @classmethod + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class( do_resize=True, size={"height": 20, "width": 20}, max_patches=2, @@ -50,37 +53,15 @@ def setUpClass(cls): image_std=[0.229, 0.224, 0.225], do_convert_rgb=True, ) - tokenizer = AutoTokenizer.from_pretrained( - "hf-internal-testing/namespace-CohereForAI-repo_name_aya-vision-8b", padding_side="left" - ) - processor_kwargs = cls.prepare_processor_dict() - processor = AyaVisionProcessor.from_pretrained( - "hf-internal-testing/namespace-CohereForAI-repo_name_aya-vision-8b", - image_processor=image_processor, - tokenizer=tokenizer, - **processor_kwargs, - ) - processor.save_pretrained(cls.tmpdirname) - cls.image_token = processor.image_token @staticmethod def prepare_processor_dict(): return {"patch_size": 10, "img_size": 20} - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def get_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) + @unittest.skip(reason="Text needs image tokens, tested in other tests") + def test_processor_with_multiple_inputs(self): + pass - # Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens def test_get_num_vision_tokens(self): "Tests general functionality of the helper used internally in vLLM" diff --git a/tests/models/blip/test_processing_blip.py b/tests/models/blip/test_processing_blip.py index 0ee96029a82d..bb1e48034b23 100644 --- a/tests/models/blip/test_processing_blip.py +++ b/tests/models/blip/test_processing_blip.py @@ -11,20 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -import pytest - -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import require_vision from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): - from transformers import AutoProcessor, BertTokenizer, BlipImageProcessor, BlipProcessor, PreTrainedTokenizerFast + from transformers import BlipProcessor @require_vision @@ -32,125 +28,6 @@ class BlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = BlipProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = BlipImageProcessor() - tokenizer = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel") - - processor = BlipProcessor(image_processor, tokenizer) - - processor.save_pretrained(cls.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - - def test_save_load_pretrained_additional_features(self): - with tempfile.TemporaryDirectory() as tmpdir: - processor = BlipProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(tmpdir) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = BlipProcessor.from_pretrained( - tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, BlipImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = BlipProcessor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_feat_extract: - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = BlipProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str, return_token_type_ids=False) - - for key in encoded_tok: - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = BlipProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual(list(inputs.keys()), ["pixel_values", "input_ids", "attention_mask"]) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = BlipProcessor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - @require_torch - @require_vision - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.get_attributes(): - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs(batch_size=2) - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - crop_size={"height": 214, "width": 214}, - size={"height": 214, "width": 214}, - padding="longest", - max_length=76, - ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 24) + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-BertModel") diff --git a/tests/models/blip_2/test_processing_blip_2.py b/tests/models/blip_2/test_processing_blip_2.py index e5c17a11ce02..13294215e6c6 100644 --- a/tests/models/blip_2/test_processing_blip_2.py +++ b/tests/models/blip_2/test_processing_blip_2.py @@ -11,12 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -import pytest - from transformers.testing_utils import require_vision from transformers.utils import is_vision_available @@ -24,7 +20,7 @@ if is_vision_available(): - from transformers import AutoProcessor, Blip2Processor, BlipImageProcessor, GPT2Tokenizer, PreTrainedTokenizerFast + from transformers import Blip2Processor @require_vision @@ -32,89 +28,15 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Blip2Processor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = BlipImageProcessor() - tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model") - - processor = Blip2Processor(image_processor, tokenizer) - - processor.save_pretrained(cls.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def prepare_processor_dict(self): - return {"num_query_tokens": 1} + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-GPT2Model") @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - - def test_save_load_pretrained_additional_features(self): - with tempfile.TemporaryDirectory() as tmpdir: - processor = Blip2Processor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(tmpdir) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = Blip2Processor.from_pretrained( - tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, BlipImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor) + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class.from_pretrained("hf-internal-testing/tiny-random-ViTModel") - image_input = self.prepare_image_inputs() - - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_feat_extract: - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - processor_kwargs = self.prepare_processor_dict() - - processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertCountEqual(list(inputs.keys()), ["input_ids", "pixel_values", "attention_mask"]) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - processor_kwargs = self.prepare_processor_dict() - - processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) + @staticmethod + def prepare_processor_dict(): + return {"num_query_tokens": 1} diff --git a/tests/models/bridgetower/test_processing_bridgetower.py b/tests/models/bridgetower/test_processing_bridgetower.py index ebaa2e6a0d07..b8019b3e3fb1 100644 --- a/tests/models/bridgetower/test_processing_bridgetower.py +++ b/tests/models/bridgetower/test_processing_bridgetower.py @@ -11,8 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest from transformers.testing_utils import require_torch, require_vision @@ -23,10 +21,7 @@ if is_vision_available(): from transformers import ( - AutoProcessor, - BridgeTowerImageProcessor, BridgeTowerProcessor, - RobertaTokenizerFast, ) @@ -35,28 +30,9 @@ class BridgeTowerProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = BridgeTowerProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = BridgeTowerImageProcessor() - tokenizer = RobertaTokenizerFast.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc") - - processor = BridgeTowerProcessor(image_processor, tokenizer) - - processor.save_pretrained(cls.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - - # Some kwargs tests are overridden from common tests to handle shortest_edge - # and size_divisor behaviour + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc") @require_torch @require_vision diff --git a/tests/models/chameleon/test_processing_chameleon.py b/tests/models/chameleon/test_processing_chameleon.py index 399483eb3e56..f8104b937ecf 100644 --- a/tests/models/chameleon/test_processing_chameleon.py +++ b/tests/models/chameleon/test_processing_chameleon.py @@ -13,20 +13,14 @@ # limitations under the License. """Testing suite for the PyTorch chameleon model.""" -import tempfile import unittest -from transformers import ChameleonProcessor, LlamaTokenizer +from transformers import ChameleonProcessor from transformers.testing_utils import get_tests_dir -from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin -if is_vision_available(): - from transformers import ChameleonImageProcessor - - SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @@ -34,16 +28,21 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = ChameleonProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - image_processor = ChameleonImageProcessor() - tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB) + def _setup_test_attributes(cls, processor): + cls.image_token = processor.image_token + + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + tokenizer = tokenizer_class(vocab_file=SAMPLE_VOCAB) tokenizer.pad_token_id = 0 tokenizer.sep_token_id = 1 tokenizer.add_special_tokens({"additional_special_tokens": [""]}) - processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer, image_seq_length=2) - processor.save_pretrained(cls.tmpdirname) - cls.image_token = processor.image_token + return tokenizer + + @unittest.skip("Chameleon processor add a sep_token at the end of each sample") + def test_tokenizer_defaults(self): + pass def test_special_mm_token_truncation(self): """Tests that special vision tokens do not get truncated when `truncation=True` is set.""" @@ -60,7 +59,6 @@ def test_special_mm_token_truncation(self): truncation=None, padding=True, ) - with self.assertRaises(ValueError): _ = processor( text=input_str, diff --git a/tests/models/chinese_clip/test_processing_chinese_clip.py b/tests/models/chinese_clip/test_processing_chinese_clip.py index dab0d37773c9..6ed492118809 100644 --- a/tests/models/chinese_clip/test_processing_chinese_clip.py +++ b/tests/models/chinese_clip/test_processing_chinese_clip.py @@ -13,13 +13,8 @@ # limitations under the License. import os -import shutil -import tempfile import unittest -import pytest - -from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import is_vision_available @@ -28,7 +23,7 @@ if is_vision_available(): - from transformers import ChineseCLIPImageProcessor, ChineseCLIPProcessor + from transformers import ChineseCLIPProcessor @require_vision @@ -36,9 +31,8 @@ class ChineseCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = ChineseCLIPProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") vocab_tokens = [ "[UNK]", "[CLS]", @@ -59,10 +53,14 @@ def setUpClass(cls): "t", "shirt", ] - cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + return tokenizer_class(vocab_file=vocab_file) + @classmethod + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") image_processor_map = { "do_resize": True, "size": {"height": 224, "width": 224}, @@ -73,127 +71,4 @@ def setUpClass(cls): "image_std": [0.26862954, 0.26130258, 0.27577711], "do_convert_rgb": True, } - tokenizer = cls.get_tokenizer() - image_processor = ChineseCLIPImageProcessor(**image_processor_map) - processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) - processor.save_pretrained(cls.tmpdirname) - - @classmethod - def get_tokenizer(cls, **kwargs): - return BertTokenizer.from_pretrained(cls.tmpdirname, **kwargs) - - @classmethod - def get_rust_tokenizer(cls, **kwargs): - return BertTokenizerFast.from_pretrained(cls.tmpdirname, **kwargs) - - @classmethod - def get_image_processor(cls, **kwargs): - return ChineseCLIPImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - - def test_save_load_pretrained_default(self): - tokenizer_slow = self.get_tokenizer() - tokenizer_fast = self.get_rust_tokenizer() - image_processor = self.get_image_processor() - - with tempfile.TemporaryDirectory() as tmpdir: - processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) - processor_slow.save_pretrained(tmpdir) - processor_slow = ChineseCLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False) - - processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) - processor_fast.save_pretrained(tmpdir) - processor_fast = ChineseCLIPProcessor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) - self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) - self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) - self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) - self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) - - self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor_slow.image_processor, ChineseCLIPImageProcessor) - self.assertIsInstance(processor_fast.image_processor, ChineseCLIPImageProcessor) - - def test_save_load_pretrained_additional_features(self): - with tempfile.TemporaryDirectory() as tmpdir: - processor = ChineseCLIPProcessor( - tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor() - ) - processor.save_pretrained(tmpdir) - - tokenizer_add_kwargs = self.get_tokenizer(cls_token="(CLS)", sep_token="(SEP)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False) - - processor = ChineseCLIPProcessor.from_pretrained( - tmpdir, cls_token="(CLS)", sep_token="(SEP)", do_normalize=False - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, BertTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, ChineseCLIPImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_feat_extract: - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "Alexandra,T-shirt的价格是15便士。" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok: - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "Alexandra,T-shirt的价格是15便士。" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertSetEqual(set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values"}) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) + return image_processor_class(**image_processor_map) diff --git a/tests/models/clip/test_processing_clip.py b/tests/models/clip/test_processing_clip.py index 6ca9a47b29c7..d42d50aae570 100644 --- a/tests/models/clip/test_processing_clip.py +++ b/tests/models/clip/test_processing_clip.py @@ -12,13 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -import pytest - -from transformers import AutoTokenizer, CLIPTokenizer, CLIPTokenizerFast from transformers.testing_utils import require_vision from transformers.utils import is_vision_available @@ -26,143 +21,10 @@ if is_vision_available(): - from transformers import CLIPImageProcessor, CLIPProcessor - - -TEST_MODEL_PATH = "openai/clip-vit-base-patch32" + from transformers import CLIPProcessor @require_vision class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = CLIPProcessor - - @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL_PATH) - image_processor = CLIPImageProcessor.from_pretrained(TEST_MODEL_PATH) - processor = CLIPProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - ) - processor.save_pretrained(cls.tmpdirname) - - @classmethod - def get_tokenizer(cls, **kwargs): - return CLIPTokenizer.from_pretrained(cls.tmpdirname, **kwargs) - - @classmethod - def get_rust_tokenizer(cls, **kwargs): - return CLIPTokenizerFast.from_pretrained(cls.tmpdirname, **kwargs) - - @classmethod - def get_image_processor(cls, **kwargs): - return CLIPImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname) - - def test_save_load_pretrained_default(self): - tokenizer_slow = self.get_tokenizer() - tokenizer_fast = self.get_rust_tokenizer() - image_processor = self.get_image_processor() - - with tempfile.TemporaryDirectory() as tmpdir: - processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) - processor_slow.save_pretrained(tmpdir) - processor_slow = CLIPProcessor.from_pretrained(tmpdir, use_fast=False) - - processor_fast = CLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) - processor_fast.save_pretrained(tmpdir) - processor_fast = CLIPProcessor.from_pretrained(tmpdir) - - self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) - self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) - self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) - self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer) - self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) - - self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor_slow.image_processor, CLIPImageProcessor) - self.assertIsInstance(processor_fast.image_processor, CLIPImageProcessor) - - def test_save_load_pretrained_additional_features(self): - with tempfile.TemporaryDirectory() as tmpdir: - processor = CLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(tmpdir) - - tokenizer_add_kwargs = CLIPTokenizer.from_pretrained(tmpdir, bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = CLIPImageProcessor.from_pretrained( - tmpdir, do_normalize=False, padding_value=1.0 - ) - - processor = CLIPProcessor.from_pretrained( - tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, CLIPImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_image_proc = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_image_proc: - self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok: - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertSetEqual(set(inputs.keys()), {"input_ids", "attention_mask", "pixel_values"}) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) + model_id = "openai/clip-vit-base-patch32" diff --git a/tests/models/clipseg/test_processing_clipseg.py b/tests/models/clipseg/test_processing_clipseg.py index f4fbf2ebde3e..73d0e8d74c3f 100644 --- a/tests/models/clipseg/test_processing_clipseg.py +++ b/tests/models/clipseg/test_processing_clipseg.py @@ -14,13 +14,10 @@ import json import os -import shutil -import tempfile import unittest import pytest -from transformers import CLIPTokenizer, CLIPTokenizerFast from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import is_vision_available @@ -29,28 +26,31 @@ if is_vision_available(): - from transformers import CLIPSegProcessor, ViTImageProcessor + from transformers import CLIPSegProcessor @require_vision class CLIPSegProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = CLIPSegProcessor - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"] # fmt: skip vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "l o", "lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) + return tokenizer_class.from_pretrained(cls.tmpdirname) + @classmethod + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") image_processor_map = { "do_resize": True, "size": 20, @@ -60,102 +60,10 @@ def setUp(self): "image_mean": [0.48145466, 0.4578275, 0.40821073], "image_std": [0.26862954, 0.26130258, 0.27577711], } - image_processor = ViTImageProcessor(**image_processor_map) - processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), image_processor=image_processor) - processor.save_pretrained(self.tmpdirname) - - image_processor = ViTImageProcessor.from_pretrained(self.tmpdirname) - image_processor.save_pretrained(self.tmpdirname) - tokenizer = CLIPTokenizer.from_pretrained(self.tmpdirname) - tokenizer.save_pretrained(self.tmpdirname) - - def get_tokenizer(self, **kwargs): - return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_rust_tokenizer(self, **kwargs): - return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - - def get_image_processor(self, **kwargs): - return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_save_load_pretrained_default(self): - tokenizer_slow = self.get_tokenizer() - tokenizer_fast = self.get_rust_tokenizer() - image_processor = self.get_image_processor() - - processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) - processor_slow.save_pretrained(self.tmpdirname) - processor_slow = CLIPSegProcessor.from_pretrained(self.tmpdirname, use_fast=False) - - processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) - processor_fast.save_pretrained(self.tmpdirname) - processor_fast = CLIPSegProcessor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) - self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) - self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) - self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer) - self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) - - self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor_slow.image_processor, ViTImageProcessor) - self.assertIsInstance(processor_fast.image_processor, ViTImageProcessor) - - def test_save_load_pretrained_additional_features(self): - processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = CLIPSegProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, ViTImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_feat_extract: - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok: - self.assertListEqual(encoded_tok[key], encoded_processor[key]) + return image_processor_class(**image_processor_map) def test_processor_text(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = self.get_processor() input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -169,10 +77,7 @@ def test_processor_text(self): processor() def test_processor_visual_prompt(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = self.get_processor() image_input = self.prepare_image_inputs() visual_prompt_input = self.prepare_image_inputs() @@ -184,16 +89,3 @@ def test_processor_visual_prompt(self): # test if it raises when no input is passed with pytest.raises(ValueError): processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) diff --git a/tests/models/cohere2_vision/test_processing_cohere2_vision.py b/tests/models/cohere2_vision/test_processing_cohere2_vision.py index 963deae9feff..2cbb67b7b203 100644 --- a/tests/models/cohere2_vision/test_processing_cohere2_vision.py +++ b/tests/models/cohere2_vision/test_processing_cohere2_vision.py @@ -12,12 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -from transformers import AutoProcessor, AutoTokenizer, Cohere2VisionProcessor -from transformers.testing_utils import require_read_token, require_torch, require_vision +from transformers import Cohere2VisionProcessor +from transformers.testing_utils import require_read_token, require_vision from transformers.utils import is_torch_available, is_torchvision_available from ...test_processing_common import ProcessorTesterMixin, url_to_local_path @@ -27,7 +25,7 @@ import torch if is_torchvision_available(): - from transformers import Cohere2VisionImageProcessorFast + pass @require_read_token @@ -37,41 +35,18 @@ class Cohere2VisionProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Cohere2VisionProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - image_processor = Cohere2VisionImageProcessorFast( + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class.from_pretrained("CohereLabs/command-a-vision-07-2025") + + @classmethod + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class( size={"height": 20, "width": 20}, max_patches=3, ) - tokenizer = AutoTokenizer.from_pretrained("CohereLabs/command-a-vision-07-2025") - - processor_kwargs = cls.prepare_processor_dict() - processor = Cohere2VisionProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - **processor_kwargs, - ) - processor.save_pretrained(cls.tmpdirname) - cls.image_token = processor.image_token - - @staticmethod - def prepare_processor_dict(): - return {} - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def get_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - @require_torch def test_process_interleaved_images_videos(self): processor = self.get_processor() diff --git a/tests/models/colpali/test_processing_colpali.py b/tests/models/colpali/test_processing_colpali.py index ca849408af0e..b3874a8ff6df 100644 --- a/tests/models/colpali/test_processing_colpali.py +++ b/tests/models/colpali/test_processing_colpali.py @@ -13,13 +13,10 @@ # limitations under the License. """Testing suite for the ColPali processor.""" -import shutil -import tempfile import unittest import torch -from transformers import GemmaTokenizer from transformers.models.colpali.processing_colpali import ColPaliProcessor from transformers.testing_utils import get_tests_dir, require_torch, require_vision from transformers.utils import is_vision_available @@ -28,11 +25,7 @@ if is_vision_available(): - from transformers import ( - ColPaliProcessor, - PaliGemmaProcessor, - SiglipImageProcessor, - ) + from transformers import ColPaliProcessor, GemmaTokenizer SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @@ -42,19 +35,24 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = ColPaliProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384") - image_processor.image_seq_length = 0 - tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True) - processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer) - processor.save_pretrained(cls.tmpdirname) + def _setup_tokenizer(cls): + return GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True) @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + image_processor = image_processor_class.from_pretrained("google/siglip-so400m-patch14-384") + image_processor.image_seq_length = 0 + return image_processor + + @unittest.skip("ColpaliProcessor can only process one of text or images at a time") + def test_processor_with_multiple_inputs(self): + pass + + @unittest.skip("ColpaliProcessor adds a prefix and suffix to the text") + def test_tokenizer_defaults(self): + pass - # Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens def test_get_num_vision_tokens(self): "Tests general functionality of the helper used internally in vLLM" diff --git a/tests/models/colqwen2/test_processing_colqwen2.py b/tests/models/colqwen2/test_processing_colqwen2.py index 5923754f717c..4a684b317d70 100644 --- a/tests/models/colqwen2/test_processing_colqwen2.py +++ b/tests/models/colqwen2/test_processing_colqwen2.py @@ -14,13 +14,11 @@ # limitations under the License. """Testing suite for the ColQwen2 processor.""" -import shutil -import tempfile import unittest import torch +from parameterized import parameterized -from transformers import AutoProcessor, Qwen2VLProcessor from transformers.models.colqwen2.processing_colqwen2 import ColQwen2Processor from transformers.testing_utils import get_tests_dir, require_torch, require_vision from transformers.utils import is_vision_available @@ -40,24 +38,21 @@ @require_vision class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = ColQwen2Processor + model_id = "vidore/colqwen2-v1.0-hf" - @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct") - processor.save_pretrained(cls.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + @parameterized.expand([(1, "pt"), (2, "pt")]) + @unittest.skip("Not tested before, to investigate") + def test_apply_chat_template_image(self, batch_size, return_tensors): + pass - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + @unittest.skip("ColQwen2Processor can only process one of text or images at a time") + def test_processor_with_multiple_inputs(self): + pass - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname) + @unittest.skip("ColQwen2Processor adds a prefix and suffix to the text") + def test_tokenizer_defaults(self): + pass - # Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens def test_get_num_vision_tokens(self): "Tests general functionality of the helper used internally in vLLM" @@ -282,6 +277,10 @@ def test_model_input_names(self): self.assertSetEqual(set(inputs.keys()), set(processor.model_input_names)) - @unittest.skip("ColPali can't process text+image inputs at the same time") + @unittest.skip("ColQwen2Processor can't process text+image inputs at the same time") def test_processor_text_has_no_visual(self): pass + + @unittest.skip("ColQwen2Processor adds a batch dimension to the pixel_values") + def test_image_processor_defaults(self): + pass diff --git a/tests/models/csm/test_processing_csm.py b/tests/models/csm/test_processing_csm.py index 4587e19c41b7..2726daacda21 100644 --- a/tests/models/csm/test_processing_csm.py +++ b/tests/models/csm/test_processing_csm.py @@ -13,8 +13,6 @@ # limitations under the License. import json -import shutil -import tempfile import unittest import jinja2 @@ -35,23 +33,21 @@ class CsmProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = CsmProcessor audio_input_name = "input_values" + model_id = "hf-internal-testing/namespace-sesame-repo_name_csm-1b" @classmethod - def setUpClass(cls): - cls.checkpoint = "hf-internal-testing/namespace-sesame-repo_name_csm-1b" - processor = CsmProcessor.from_pretrained(cls.checkpoint) + def _setup_test_attributes(cls, processor): cls.audio_token = processor.audio_token cls.audio_token_id = processor.audio_token_id cls.pad_token_id = processor.tokenizer.pad_token_id cls.bos_token_id = processor.tokenizer.bos_token_id - cls.tmpdirname = tempfile.mkdtemp() - processor.save_pretrained(cls.tmpdirname) - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) + @unittest.skip("CsmProcessor modifies the tokenizer inputs") + def test_tokenizer_defaults(self): + pass - def prepare_processor_dict(self): + @staticmethod + def prepare_processor_dict(): return {"chat_template": "\n{%- for message in messages %}\n {#-- Validate role is a stringified integer --#}\n {%- if not message['role'] is string or not message['role'].isdigit() %}\n {{- raise_exception(\"The role must be an integer or a stringified integer (e.g. '0') designating the speaker id\") }}\n {%- endif %}\n\n {#-- Validate content is a list --#}\n {%- set content = message['content'] %}\n {%- if content is not iterable or content is string %}\n {{- raise_exception(\"The content must be a list\") }}\n {%- endif %}\n\n {#-- Collect content types --#}\n {%- set content_types = content | map(attribute='type') | list %}\n {%- set is_last = loop.last %}\n\n {#-- Last message validation --#}\n {%- if is_last %}\n {%- if 'text' not in content_types %}\n {{- raise_exception(\"The last message must include one item of type 'text'\") }}\n {%- elif (content_types | select('equalto', 'text') | list | length > 1) or (content_types | select('equalto', 'audio') | list | length > 1) %}\n {{- raise_exception(\"At most two items are allowed in the last message: one 'text' and one 'audio'\") }}\n {%- endif %}\n\n {#-- All other messages validation --#}\n {%- else %}\n {%- if content_types | select('equalto', 'text') | list | length != 1\n or content_types | select('equalto', 'audio') | list | length != 1 %}\n {{- raise_exception(\"Each message (except the last) must contain exactly one 'text' and one 'audio' item\") }}\n {%- elif content_types | reject('in', ['text', 'audio']) | list | length > 0 %}\n {{- raise_exception(\"Only 'text' and 'audio' types are allowed in content\") }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in messages %}\n {{- bos_token }}\n {{- '[' + message['role'] + ']' }}\n {{- message['content'][0]['text'] }}\n {{- eos_token }}\n {%- if message['content']|length > 1 %}\n {{- '<|AUDIO|><|audio_eos|>' }}\n {%- endif %}\n{%- endfor %}\n"} # fmt: skip def test_chat_template_is_saved(self): diff --git a/tests/models/deepseek_vl/test_processing_deepseek_vl.py b/tests/models/deepseek_vl/test_processing_deepseek_vl.py index e96acfd80eb4..beabe0262f0b 100644 --- a/tests/models/deepseek_vl/test_processing_deepseek_vl.py +++ b/tests/models/deepseek_vl/test_processing_deepseek_vl.py @@ -12,43 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import tempfile import unittest -from transformers import DeepseekVLProcessor, LlamaTokenizer +from transformers import DeepseekVLProcessor from transformers.testing_utils import get_tests_dir -from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin -if is_vision_available(): - from transformers import DeepseekVLImageProcessor - - SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") class DeepseekVLProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = DeepseekVLProcessor - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - image_processor = DeepseekVLImageProcessor() - tokenizer = LlamaTokenizer( + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class( vocab_file=SAMPLE_VOCAB, extra_special_tokens={ "pad_token": "<|end▁of▁sentence|>", "image_token": "", }, ) - processor_kwargs = self.prepare_processor_dict() - processor = self.processor_class( - image_processor=image_processor, - tokenizer=tokenizer, - **processor_kwargs, - ) - processor.save_pretrained(self.tmpdirname) @staticmethod def prepare_processor_dict(): diff --git a/tests/models/deepseek_vl_hybrid/test_processing_deepseek_vl_hybrid.py b/tests/models/deepseek_vl_hybrid/test_processing_deepseek_vl_hybrid.py index 46178e30f671..b643fbc7d785 100644 --- a/tests/models/deepseek_vl_hybrid/test_processing_deepseek_vl_hybrid.py +++ b/tests/models/deepseek_vl_hybrid/test_processing_deepseek_vl_hybrid.py @@ -12,43 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import tempfile import unittest -from transformers import DeepseekVLHybridProcessor, LlamaTokenizer +from transformers import DeepseekVLHybridProcessor from transformers.testing_utils import get_tests_dir -from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin -if is_vision_available(): - from transformers import DeepseekVLHybridImageProcessor - - SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") class DeepseekVLHybridProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = DeepseekVLHybridProcessor - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - image_processor = DeepseekVLHybridImageProcessor() - tokenizer = LlamaTokenizer( + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class( vocab_file=SAMPLE_VOCAB, extra_special_tokens={ "pad_token": "<|end▁of▁sentence|>", "image_token": "", }, ) - processor_kwargs = self.prepare_processor_dict() - processor = self.processor_class( - image_processor=image_processor, - tokenizer=tokenizer, - **processor_kwargs, - ) - processor.save_pretrained(self.tmpdirname) @staticmethod def prepare_processor_dict(): diff --git a/tests/models/donut/test_processing_donut.py b/tests/models/donut/test_processing_donut.py index 272f1fd82341..b3a8732a7c93 100644 --- a/tests/models/donut/test_processing_donut.py +++ b/tests/models/donut/test_processing_donut.py @@ -13,30 +13,17 @@ # limitations under the License. -import tempfile import unittest -from transformers import DonutImageProcessor, DonutProcessor, XLMRobertaTokenizerFast +from transformers import DonutProcessor from ...test_processing_common import ProcessorTesterMixin class DonutProcessorTest(ProcessorTesterMixin, unittest.TestCase): - from_pretrained_id = "naver-clova-ix/donut-base" + model_id = "naver-clova-ix/donut-base" processor_class = DonutProcessor - @classmethod - def setUpClass(cls): - cls.processor = DonutProcessor.from_pretrained(cls.from_pretrained_id) - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = DonutImageProcessor() - tokenizer = XLMRobertaTokenizerFast.from_pretrained(cls.from_pretrained_id) - - processor = DonutProcessor(image_processor, tokenizer) - - processor.save_pretrained(cls.tmpdirname) - def test_token2json(self): expected_json = { "name": "John Doe", @@ -58,6 +45,7 @@ def test_token2json(self): "text\nwith\nnewlines" "" ) - actual_json = self.processor.token2json(sequence) + processor = self.get_processor() + actual_json = processor.token2json(sequence) self.assertDictEqual(actual_json, expected_json) diff --git a/tests/models/emu3/test_processing_emu3.py b/tests/models/emu3/test_processing_emu3.py index a87dd314d452..9b1fa66d0a62 100644 --- a/tests/models/emu3/test_processing_emu3.py +++ b/tests/models/emu3/test_processing_emu3.py @@ -13,28 +13,26 @@ # limitations under the License. """Testing suite for the PyTorch emu3 model.""" -import tempfile import unittest import numpy as np -from transformers import Emu3Processor, GPT2TokenizerFast -from transformers.utils import is_vision_available +from transformers import Emu3Processor from ...test_processing_common import ProcessorTesterMixin -if is_vision_available(): - from transformers import Emu3ImageProcessor - - class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Emu3Processor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - image_processor = Emu3ImageProcessor(min_pixels=28 * 28, max_pixels=56 * 56) + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class(min_pixels=28 * 28, max_pixels=56 * 56) + + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") extra_special_tokens = { "image_token": "", "boi_token": "<|image start|>", @@ -42,16 +40,10 @@ def setUpClass(cls): "image_wrapper_token": "<|image token|>", "eof_token": "<|extra_201|>", } - tokenizer = GPT2TokenizerFast.from_pretrained( - "openai-community/gpt2", extra_special_tokens=extra_special_tokens - ) + tokenizer = tokenizer_class.from_pretrained("openai-community/gpt2", extra_special_tokens=extra_special_tokens) tokenizer.pad_token_id = 0 tokenizer.sep_token_id = 1 - processor = cls.processor_class( - image_processor=image_processor, tokenizer=tokenizer, chat_template="dummy_template" - ) - processor.save_pretrained(cls.tmpdirname) - cls.image_token = processor.image_token + return tokenizer @staticmethod def prepare_processor_dict(): diff --git a/tests/models/evolla/test_processing_evolla.py b/tests/models/evolla/test_processing_evolla.py index e1adfb996b61..cafbb49661f3 100644 --- a/tests/models/evolla/test_processing_evolla.py +++ b/tests/models/evolla/test_processing_evolla.py @@ -13,8 +13,6 @@ # limitations under the License. import random -import shutil -import tempfile import unittest from transformers import ( @@ -38,15 +36,12 @@ @require_torch class EvollaProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = EvollaProcessor + model_id = "westlake-repl/Evolla-10B-hf" + input_keys = ["protein_input_ids", "protein_attention_mask", "input_ids", "attention_mask"] - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - - processor = EvollaProcessor.from_pretrained("westlake-repl/Evolla-10B-hf") - - processor.save_pretrained(self.tmpdirname) - - self.input_keys = ["protein_input_ids", "protein_attention_mask", "input_ids", "attention_mask"] + @unittest.skip("EvollaProcessor requires `messages_list` and `proteins` inputs.") + def test_processor_with_multiple_inputs(self): + pass def prepare_input_and_expected_output(self): amino_acid_sequence = "AAAA" @@ -148,31 +143,9 @@ def prepare_input_and_expected_output(self): ] return protein_dict, message, expected_output - def test_processor(self): - protein_tokenizer = self.get_protein_tokenizer() - tokenizer = self.get_tokenizer() - - processor = EvollaProcessor(protein_tokenizer, tokenizer) - - protein_dict, message, expected_output = self.prepare_input_and_expected_output() - inputs = processor(proteins=[protein_dict], messages_list=[message]) - - # check if the input is correct - for key, value in expected_output.items(): - self.assertTrue( - torch.equal(inputs[key], value), - f"inputs[key] is {inputs[key]} and expected_output[key] is {value}", - ) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - def get_protein_tokenizer(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).protein_tokenizer - def tearDown(self): - shutil.rmtree(self.tmpdirname) - def prepare_inputs_single(self): proteins = { "aa_seq": "".join(random.choices(EVOLLA_VALID_AA, k=100)), @@ -269,27 +242,8 @@ def prepare_inputs(self, protein_types="pair"): messages_list.append(messages) return proteins, messages_list - def test_tokenizer_decode(self): - protein_tokenizer = self.get_protein_tokenizer() - tokenizer = self.get_tokenizer() - - processor = EvollaProcessor(tokenizer=tokenizer, protein_tokenizer=protein_tokenizer, return_tensors="pt") - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - def test_model_input_names(self): - protein_tokenizer = self.get_protein_tokenizer() - tokenizer = self.get_tokenizer() - - processor = EvollaProcessor(tokenizer=tokenizer, protein_tokenizer=protein_tokenizer) + processor = self.get_processor() proteins, messages_list = self.prepare_inputs() - inputs = processor(messages_list=messages_list, proteins=proteins, padding="longest", return_tensors="pt") - - # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] self.assertSetEqual(set(inputs.keys()), set(self.input_keys)) diff --git a/tests/models/flava/test_processing_flava.py b/tests/models/flava/test_processing_flava.py index 52a957f2d60f..9b866c689b83 100644 --- a/tests/models/flava/test_processing_flava.py +++ b/tests/models/flava/test_processing_flava.py @@ -13,14 +13,8 @@ # limitations under the License. import os -import random -import shutil -import tempfile import unittest -import pytest - -from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import is_vision_available @@ -29,7 +23,7 @@ if is_vision_available(): - from transformers import FlavaImageProcessor, FlavaProcessor + from transformers import FlavaProcessor from transformers.models.flava.image_processing_flava import ( FLAVA_CODEBOOK_MEAN, FLAVA_CODEBOOK_STD, @@ -42,15 +36,9 @@ class FlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = FlavaProcessor - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - - vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "want", "##want", "##ed", "wa", "un", "runn", "##ing", ",", "low", "lowest"] # fmt: skip - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write("".join([x + "\n" for x in vocab_tokens])) - + @classmethod + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") image_processor_map = { "image_mean": FLAVA_IMAGE_MEAN, "image_std": FLAVA_IMAGE_STD, @@ -75,151 +63,15 @@ def setUp(self): "codebook_image_std": FLAVA_CODEBOOK_STD, } - image_processor = FlavaImageProcessor(**image_processor_map) - processor = FlavaProcessor(tokenizer=self.get_tokenizer(), image_processor=image_processor) - processor.save_pretrained(self.tmpdirname) - - image_processor = FlavaImageProcessor.from_pretrained(self.tmpdirname) - image_processor.save_pretrained(self.tmpdirname) - tokenizer = BertTokenizer.from_pretrained(self.tmpdirname) - tokenizer.save_pretrained(self.tmpdirname) - - def get_tokenizer(self, **kwargs): - return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_rust_tokenizer(self, **kwargs): - return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - - def get_image_processor(self, **kwargs): - return FlavaImageProcessor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_save_load_pretrained_default(self): - tokenizer_slow = self.get_tokenizer() - tokenizer_fast = self.get_rust_tokenizer() - image_processor = self.get_image_processor() - - processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) - processor_slow.save_pretrained(self.tmpdirname) - processor_slow = FlavaProcessor.from_pretrained(self.tmpdirname, use_fast=False) - - processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) - processor_fast.save_pretrained(self.tmpdirname) - processor_fast = FlavaProcessor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) - self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) - self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) - self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) - self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) - - self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor_slow.image_processor, FlavaImageProcessor) - self.assertIsInstance(processor_fast.image_processor, FlavaImageProcessor) - - def test_save_load_pretrained_additional_features(self): - processor = FlavaProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = FlavaProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, BertTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, FlavaImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() + image_processor = image_processor_class(**image_processor_map) + return image_processor - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_feat_extract: - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - # With rest of the args - random.seed(1234) - input_feat_extract = image_processor( - image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np" - ) - random.seed(1234) - input_processor = processor( - images=image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np" - ) - - for key in input_feat_extract: - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok: - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertSetEqual(set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values"}) - - # add extra args - inputs = processor(text=input_str, images=image_input, return_codebook_pixels=True, return_image_mask=True) - - self.assertSetEqual( - set(inputs.keys()), - { - "input_ids", - "token_type_ids", - "attention_mask", - "pixel_values", - "codebook_pixel_values", - "bool_masked_pos", - }, - ) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "want", "##want", "##ed", "wa", "un", "runn", "##ing", ",", "low", "lowest"] # fmt: skip + vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(vocab_file, "w", encoding="utf-8") as fp: + fp.write("".join([x + "\n" for x in vocab_tokens])) - self.assertListEqual(decoded_tok, decoded_processor) + return tokenizer_class.from_pretrained(cls.tmpdirname) diff --git a/tests/models/florence2/test_processing_florence2.py b/tests/models/florence2/test_processing_florence2.py index 351e4768e53d..cf535e77020d 100644 --- a/tests/models/florence2/test_processing_florence2.py +++ b/tests/models/florence2/test_processing_florence2.py @@ -11,13 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -from transformers import AutoProcessor, BartTokenizerFast, Florence2Processor +from transformers import Florence2Processor from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_torch_available, is_vision_available +from transformers.utils import is_torch_available from ...test_processing_common import ProcessorTesterMixin @@ -25,9 +23,6 @@ if is_torch_available(): import torch -if is_vision_available(): - from transformers import CLIPImageProcessor - @require_torch @require_vision @@ -35,19 +30,24 @@ class Florence2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Florence2Processor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = CLIPImageProcessor.from_pretrained("florence-community/Florence-2-base") + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + image_processor = image_processor_class.from_pretrained("florence-community/Florence-2-base") image_processor.image_seq_length = 0 - tokenizer = BartTokenizerFast.from_pretrained("florence-community/Florence-2-base") + return image_processor + + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + tokenizer = tokenizer_class.from_pretrained("florence-community/Florence-2-base") tokenizer.image_token = "" tokenizer.image_token_id = tokenizer.encode(tokenizer.image_token, add_special_tokens=False)[0] tokenizer.extra_special_tokens = {"image_token": ""} - processor_kwargs = cls.prepare_processor_dict() - processor = Florence2Processor(image_processor, tokenizer, **processor_kwargs) - processor.save_pretrained(cls.tmpdirname) - cls.image_token = processor.image_token + return tokenizer + + @unittest.skip("Florence2Processor adds prefix and suffix tokens to the text") + def test_tokenizer_defaults(self): + pass @staticmethod def prepare_processor_dict(): @@ -67,16 +67,6 @@ def prepare_processor_dict(): } } - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - def test_construct_prompts(self): processor = self.processor_class.from_pretrained(self.tmpdirname) diff --git a/tests/models/fuyu/test_processing_fuyu.py b/tests/models/fuyu/test_processing_fuyu.py index d88843c6d158..9d825fb7a0ee 100644 --- a/tests/models/fuyu/test_processing_fuyu.py +++ b/tests/models/fuyu/test_processing_fuyu.py @@ -1,10 +1,6 @@ -import tempfile import unittest -from shutil import rmtree from transformers import ( - AutoProcessor, - AutoTokenizer, FuyuImageProcessor, FuyuProcessor, is_torch_available, @@ -25,41 +21,24 @@ @require_vision class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase): processor_class = FuyuProcessor + model_id = "adept/fuyu-8b" @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = FuyuImageProcessor() - tokenizer = AutoTokenizer.from_pretrained("adept/fuyu-8b") - - processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer) - processor.save_pretrained(cls.tmpdirname) - + def _setup_test_attributes(cls, processor): cls.text_prompt = "Generate a coco-style caption.\\n" bus_image_url = url_to_local_path( "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png" ) cls.bus_image_pil = load_image(bus_image_url) - @classmethod - def tearDownClass(cls): - rmtree(cls.tmpdirname) - - def get_processor(self): - image_processor = FuyuImageProcessor() - tokenizer = AutoTokenizer.from_pretrained("adept/fuyu-8b") - processor = FuyuProcessor(image_processor, tokenizer, **self.prepare_processor_dict()) - - return processor - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + @unittest.skip("FuyuProcessor doesn't return typical pixel values for images") + def test_image_processor_defaults(self): + pass - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + @unittest.skip("FuyuProcessor doesn't return typical pixel values for images") + def test_processor_with_multiple_inputs(self): + pass - # Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens def test_get_num_vision_tokens(self): "Tests general functionality of the helper used internally in vLLM" @@ -91,7 +70,7 @@ def test_fuyu_processing_no_image(self): Test to check processor works with just text input """ processor_outputs = self.get_processor()(text=self.text_prompt) - tokenizer_outputs = self.get_tokenizer()(self.text_prompt) + tokenizer_outputs = self.get_component("tokenizer")(self.text_prompt) self.assertEqual(processor_outputs["input_ids"], tokenizer_outputs["input_ids"]) def test_fuyu_processing_no_text(self): diff --git a/tests/models/gemma3/test_processing_gemma3.py b/tests/models/gemma3/test_processing_gemma3.py index 9e773de46d1b..455731b71ca9 100644 --- a/tests/models/gemma3/test_processing_gemma3.py +++ b/tests/models/gemma3/test_processing_gemma3.py @@ -12,20 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -from transformers import Gemma3Processor, GemmaTokenizer +from transformers import Gemma3Processor from transformers.testing_utils import get_tests_dir, require_vision -from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin -if is_vision_available(): - from transformers import Gemma3ImageProcessor - SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @@ -34,30 +28,34 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Gemma3Processor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() + def _setup_test_attributes(cls, processor): + cls.image_token = processor.boi_token + + @classmethod + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") gemma3_image_processor_kwargs = { "do_pan_and_scan": True, "pan_and_scan_min_crop_size": 256, "pan_and_scan_max_num_crops": 4, "pan_and_scan_min_ratio_to_activate": 1.2, } - image_processor = Gemma3ImageProcessor.from_pretrained( + image_processor = image_processor_class.from_pretrained( "google/siglip-so400m-patch14-384", **gemma3_image_processor_kwargs ) + return image_processor + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") extra_special_tokens = { "image_token": "", "boi_token": "", "eoi_token": "", } - tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True, extra_special_tokens=extra_special_tokens) - processor_kwargs = cls.prepare_processor_dict() - processor = Gemma3Processor(image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs) - processor.save_pretrained(cls.tmpdirname) - cls.image_token = processor.boi_token + tokenizer = tokenizer_class(SAMPLE_VOCAB, keep_accents=True, extra_special_tokens=extra_special_tokens) + return tokenizer - # Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens def test_get_num_vision_tokens(self): "Tests general functionality of the helper used internally in vLLM" @@ -70,11 +68,6 @@ def test_get_num_vision_tokens(self): self.assertTrue("num_image_patches" in output) self.assertEqual(len(output["num_image_patches"]), 3) - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - - # TODO: raushan or arthur: add the real chat template @staticmethod def prepare_processor_dict(): return { @@ -102,16 +95,16 @@ def test_text_with_image_tokens(self): # If text has no image tokens, image should be `None` with self.assertRaises(ValueError): - _ = processor(text=text_no_image, images=image, return_tensors="np") + _ = processor(text=text_no_image, images=image, return_tensors="pt") # We can't be sure what is users intention: if user wants one image per text OR two images for first text and no image for second text with self.assertRaises(ValueError): - _ = processor(text=[text_single_image, text_single_image], images=[image, image], return_tensors="np") + _ = processor(text=[text_single_image, text_single_image], images=[image, image], return_tensors="pt") # The users is expected to be explicit about which image belong to which text by nesting the images list - out_multiimages = processor(text=text_multi_images, images=[image, image], return_tensors="np") + out_multiimages = processor(text=text_multi_images, images=[image, image], return_tensors="pt") out_batch_oneimage = processor( - text=[text_single_image, text_single_image], images=[[image], [image]], return_tensors="np" + text=[text_single_image, text_single_image], images=[[image], [image]], return_tensors="pt" ) self.assertListEqual( out_batch_oneimage[self.images_input_name].tolist(), out_multiimages[self.images_input_name].tolist() @@ -127,7 +120,7 @@ def test_pan_and_scan(self): inputs = processor( text=input_str, images=image_input, - return_tensors="np", + return_tensors="pt", do_pan_and_scan=True, image_seq_length=2, pan_and_scan_min_crop_size=10, diff --git a/tests/models/gemma3n/test_processing_gemma3n.py b/tests/models/gemma3n/test_processing_gemma3n.py index 2fbe7e79d3e5..65d69172cf82 100644 --- a/tests/models/gemma3n/test_processing_gemma3n.py +++ b/tests/models/gemma3n/test_processing_gemma3n.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -import numpy as np -from parameterized import parameterized - -from transformers import GemmaTokenizerFast, SiglipImageProcessorFast, is_speech_available +from transformers import is_speech_available from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio, require_vision +from ...test_processing_common import ProcessorTesterMixin from .test_feature_extraction_gemma3n import floats_list if is_speech_available(): - from transformers.models.gemma3n import Gemma3nAudioFeatureExtractor, Gemma3nProcessor + from transformers.models.gemma3n import Gemma3nProcessor # TODO: omni-modal processor can't run tests from `ProcessorTesterMixin` @@ -34,97 +30,20 @@ @require_torchaudio @require_vision @require_sentencepiece -class Gemma3nProcessorTest(unittest.TestCase): - def setUp(self): - # TODO: update to google? - self.model_id = "hf-internal-testing/namespace-google-repo_name-gemma-3n-E4B-it" - self.tmpdirname = tempfile.mkdtemp(suffix="gemma3n") - self.maxDiff = None - - def get_tokenizer(self, **kwargs): - return GemmaTokenizerFast.from_pretrained(self.model_id, **kwargs) - - def get_feature_extractor(self, **kwargs): - return Gemma3nAudioFeatureExtractor.from_pretrained(self.model_id, **kwargs) - - def get_image_processor(self, **kwargs): - return SiglipImageProcessorFast.from_pretrained(self.model_id, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_save_load_pretrained_default(self): - # NOTE: feature_extractor and image_processor both use the same filename, preprocessor_config.json, when saved to - # disk, but the files are overwritten by processor.save_pretrained(). This test does not attempt to address - # this potential issue, and as such, does not guarantee content accuracy. - - tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() - image_processor = self.get_image_processor() - - processor = Gemma3nProcessor( - tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor - ) - - processor.save_pretrained(self.tmpdirname, legacy_serialization=False) - processor = Gemma3nProcessor.from_pretrained(self.tmpdirname) - - self.assertIsInstance(processor.tokenizer, GemmaTokenizerFast) - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - - self.assertIsInstance(processor.feature_extractor, Gemma3nAudioFeatureExtractor) - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) - - def test_save_load_pretrained_additional_features(self): - tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() - image_processor = self.get_image_processor() - - processor = Gemma3nProcessor( - tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor - ) - processor.save_pretrained(self.tmpdirname, legacy_serialization=False) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS-BOS)", eos_token="(EOS-EOS)") - feature_extractor_add_kwargs = self.get_feature_extractor(dither=5.0, padding_value=1.0) +class Gemma3nProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = Gemma3nProcessor + model_id = "hf-internal-testing/namespace-google-repo_name-gemma-3n-E4B-it" - processor = Gemma3nProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS-BOS)", eos_token="(EOS-EOS)", dither=5.0, padding_value=1.0 - ) + def prepare_image_inputs(self, batch_size: int | None = None, nested: bool = False): + return super().prepare_image_inputs(batch_size=batch_size, nested=True) - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, GemmaTokenizerFast) - - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.feature_extractor, Gemma3nAudioFeatureExtractor) - - @parameterized.expand([256, 512, 768, 1024]) - def test_image_processor(self, image_size: int): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - image_processor = self.get_image_processor() - processor = Gemma3nProcessor( - tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor - ) - - raw_image = np.random.randint(0, 256, size=(image_size, image_size, 3), dtype=np.uint8) - input_image_processor = image_processor(raw_image, return_tensors="pt") - input_processor = processor(text="Describe:", images=raw_image, return_tensors="pt") - - for key in input_image_processor: - self.assertAlmostEqual(input_image_processor[key].sum(), input_processor[key].sum(), delta=1e-2) - if "pixel_values" in key: - # NOTE: all images should be re-scaled to 768x768 - self.assertEqual(input_image_processor[key].shape, (1, 3, 768, 768)) - self.assertEqual(input_processor[key].shape, (1, 3, 768, 768)) + @classmethod + def _setup_test_attributes(cls, processor): + cls.image_token = processor.boi_token def test_audio_feature_extractor(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - image_processor = self.get_image_processor() - processor = Gemma3nProcessor( - tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor - ) + processor = self.get_processor() + feature_extractor = self.get_component("feature_extractor") raw_speech = floats_list((3, 1000)) input_feat_extract = feature_extractor(raw_speech, return_tensors="pt") @@ -132,35 +51,3 @@ def test_audio_feature_extractor(self): for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - image_processor = self.get_image_processor() - processor = Gemma3nProcessor( - tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor - ) - - input_str = "This is a test string" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok: - self.assertListEqual(encoded_tok[key], encoded_processor[key][0]) - - def test_tokenizer_decode(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - image_processor = self.get_image_processor() - processor = Gemma3nProcessor( - tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor - ) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) diff --git a/tests/models/git/test_processing_git.py b/tests/models/git/test_processing_git.py index 5e06636007bc..2ad7029e46e5 100644 --- a/tests/models/git/test_processing_git.py +++ b/tests/models/git/test_processing_git.py @@ -11,12 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -import pytest - from transformers.testing_utils import require_vision from transformers.utils import is_vision_available @@ -24,7 +20,7 @@ if is_vision_available(): - from transformers import AutoProcessor, BertTokenizer, CLIPImageProcessor, GitProcessor, PreTrainedTokenizerFast + from transformers import GitProcessor @require_vision @@ -32,101 +28,8 @@ class GitProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = GitProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = CLIPImageProcessor() - tokenizer = BertTokenizer.from_pretrained( + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class.from_pretrained( "hf-internal-testing/tiny-random-BertModel", model_input_names=["input_ids", "attention_mask"] ) - - processor = GitProcessor(image_processor, tokenizer) - - processor.save_pretrained(cls.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - - def test_save_load_pretrained_additional_features(self): - with tempfile.TemporaryDirectory() as tmpdir: - processor = GitProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(tmpdir) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = GitProcessor.from_pretrained( - tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, CLIPImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_feat_extract: - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str, return_token_type_ids=False) - - for key in encoded_tok: - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertSetEqual(set(inputs.keys()), {"input_ids", "attention_mask", "pixel_values"}) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) diff --git a/tests/models/glm46v/test_processor_glm46v.py b/tests/models/glm46v/test_processor_glm46v.py index 268f20d89c89..344e2e293727 100644 --- a/tests/models/glm46v/test_processor_glm46v.py +++ b/tests/models/glm46v/test_processor_glm46v.py @@ -13,13 +13,10 @@ # limitations under the License. import inspect -import shutil -import tempfile import unittest import numpy as np -from transformers import AutoProcessor from transformers.testing_utils import require_av, require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available @@ -37,31 +34,21 @@ @require_torch class Glm46VProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Glm46VProcessor + model_id = "THUDM/GLM-4.1V-9B-Thinking" @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - processor = Glm46VProcessor.from_pretrained( - "THUDM/GLM-4.1V-9B-Thinking", patch_size=4, size={"shortest_edge": 12 * 12, "longest_edge": 18 * 18} - ) - processor.save_pretrained(cls.tmpdirname) + def _setup_test_attributes(cls, processor): cls.image_token = processor.image_token - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def get_video_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor - - def get_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) + def _setup_from_pretrained(cls, model_id, **kwargs): + return super()._setup_from_pretrained( + model_id, + do_sample_frames=False, + patch_size=4, + size={"shortest_edge": 12 * 12, "longest_edge": 18 * 18}, + **kwargs, + ) @require_torch @require_av diff --git a/tests/models/glm4v/test_processor_glm4v.py b/tests/models/glm4v/test_processor_glm4v.py index 0b52faa66b3c..5acf39e6e731 100644 --- a/tests/models/glm4v/test_processor_glm4v.py +++ b/tests/models/glm4v/test_processor_glm4v.py @@ -13,13 +13,10 @@ # limitations under the License. import inspect -import shutil -import tempfile import unittest import numpy as np -from transformers import AutoProcessor from transformers.testing_utils import require_av, require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available @@ -37,31 +34,21 @@ @require_torch class Glm4vProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Glm4vProcessor + model_id = "THUDM/GLM-4.1V-9B-Thinking" @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - processor = Glm4vProcessor.from_pretrained( - "THUDM/GLM-4.1V-9B-Thinking", patch_size=4, size={"shortest_edge": 12 * 12, "longest_edge": 18 * 18} - ) - processor.save_pretrained(cls.tmpdirname) + def _setup_test_attributes(cls, processor): cls.image_token = processor.image_token - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def get_video_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor - - def get_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) + def _setup_from_pretrained(cls, model_id, **kwargs): + return super()._setup_from_pretrained( + model_id, + do_sample_frames=False, + patch_size=4, + size={"shortest_edge": 12 * 12, "longest_edge": 18 * 18}, + **kwargs, + ) @require_torch @require_av @@ -267,13 +254,13 @@ def test_apply_chat_template_video_frame_sampling(self): do_sample_frames=True, ) - def test_model_input_names(self): - processor = self.get_processor() + # def test_model_input_names(self): + # processor = self.get_processor() - text = self.prepare_text_inputs(modalities=["image", "video"]) - image_input = self.prepare_image_inputs() - video_inputs = self.prepare_video_inputs() - inputs_dict = {"text": text, "images": image_input, "videos": video_inputs} - inputs = processor(**inputs_dict, return_tensors="pt", do_sample_frames=False) + # text = self.prepare_text_inputs(modalities=["image", "video"]) + # image_input = self.prepare_image_inputs() + # video_inputs = self.prepare_video_inputs() + # inputs_dict = {"text": text, "images": image_input, "videos": video_inputs} + # inputs = processor(**inputs_dict, return_tensors="pt", do_sample_frames=False) - self.assertSetEqual(set(inputs.keys()), set(processor.model_input_names)) + # self.assertSetEqual(set(inputs.keys()), set(processor.model_input_names)) diff --git a/tests/models/got_ocr2/test_processing_got_ocr2.py b/tests/models/got_ocr2/test_processing_got_ocr2.py index ffa0f97cd4e4..497c5eea125b 100644 --- a/tests/models/got_ocr2/test_processing_got_ocr2.py +++ b/tests/models/got_ocr2/test_processing_got_ocr2.py @@ -12,45 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -from transformers import AutoProcessor, GotOcr2Processor, PreTrainedTokenizerFast +from transformers import GotOcr2Processor from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin -if is_vision_available(): - from transformers import GotOcr2ImageProcessor - - @require_vision class GotOcr2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = GotOcr2Processor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = GotOcr2ImageProcessor() - tokenizer = PreTrainedTokenizerFast.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf") - processor_kwargs = {} - processor = GotOcr2Processor(image_processor, tokenizer, **processor_kwargs) - processor.save_pretrained(cls.tmpdirname) - cls.image_token = processor.img_pad_token - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + tokenizer = tokenizer_class.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf") + return tokenizer + + @unittest.skip("GotOcr2Processor pop the image processor output 'num_patches'") + def test_image_processor_defaults(self): + pass def test_ocr_queries(self): processor = self.get_processor() diff --git a/tests/models/grounding_dino/test_processing_grounding_dino.py b/tests/models/grounding_dino/test_processing_grounding_dino.py index 30a478ada427..6ebb4d82fc09 100644 --- a/tests/models/grounding_dino/test_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_processing_grounding_dino.py @@ -13,16 +13,12 @@ # limitations under the License. import os -import shutil -import tempfile import unittest -import pytest - -from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor +from transformers import GroundingDinoProcessor from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_torch_available, is_vision_available +from transformers.utils import is_torch_available from ...test_processing_common import ProcessorTesterMixin @@ -32,26 +28,21 @@ from transformers.models.grounding_dino.modeling_grounding_dino import GroundingDinoObjectDetectionOutput -if is_vision_available(): - from transformers import GroundingDinoImageProcessor - @require_torch @require_vision class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase): - from_pretrained_id = "IDEA-Research/grounding-dino-base" + model_id = "IDEA-Research/grounding-dino-base" processor_class = GroundingDinoProcessor + batch_size = 7 + num_queries = 5 + embed_dim = 5 + seq_length = 5 @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - vocab_tokens = ["[UNK]","[CLS]","[SEP]","[PAD]","[MASK]","want","##want","##ed","wa","un","runn","##ing",",","low","lowest"] # fmt: skip - cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - - image_processor = GroundingDinoImageProcessor( + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class( do_resize=True, size=None, do_normalize=True, @@ -61,16 +52,19 @@ def setUpClass(cls): rescale_factor=1 / 255, do_pad=True, ) - tokenizer = BertTokenizer.from_pretrained(cls.from_pretrained_id) - - processor = GroundingDinoProcessor(image_processor, tokenizer) - processor.save_pretrained(cls.tmpdirname) + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + vocab_tokens = ["[UNK]","[CLS]","[SEP]","[PAD]","[MASK]","want","##want","##ed","wa","un","runn","##ing",",","low","lowest"] # fmt: skip + vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + return tokenizer_class.from_pretrained(cls.tmpdirname) - cls.batch_size = 7 - cls.num_queries = 5 - cls.embed_dim = 5 - cls.seq_length = 5 + @unittest.skip("GroundingDinoProcessor merges candidate labels text") + def test_tokenizer_defaults(self): + pass def prepare_text_inputs(self, batch_size: int | None = None, **kwargs): labels = ["a cat", "remote control"] @@ -86,25 +80,6 @@ def prepare_text_inputs(self, batch_size: int | None = None, **kwargs): return [labels] return [labels, labels_longer] + [labels] * (batch_size - 2) - @classmethod - # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert - def get_tokenizer(cls, **kwargs): - return BertTokenizer.from_pretrained(cls.tmpdirname, **kwargs) - - @classmethod - # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.get_rust_tokenizer with CLIP->Bert - def get_rust_tokenizer(cls, **kwargs): - return BertTokenizerFast.from_pretrained(cls.tmpdirname, **kwargs) - - @classmethod - # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.get_image_processor with CLIP->GroundingDino - def get_image_processor(cls, **kwargs): - return GroundingDinoImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - def get_fake_grounding_dino_output(self): torch.manual_seed(42) return GroundingDinoObjectDetectionOutput( @@ -118,10 +93,7 @@ def get_fake_grounding_dino_input_ids(self): return torch.stack([input_ids] * self.batch_size, dim=0) def test_post_process_grounded_object_detection(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = self.get_processor() grounding_dino_output = self.get_fake_grounding_dino_output() @@ -138,121 +110,8 @@ def test_post_process_grounded_object_detection(self): expected_box_slice = torch.tensor([0.6908, 0.4354, 1.0737, 1.3947]) torch.testing.assert_close(post_processed[0]["boxes"][0], expected_box_slice, rtol=1e-4, atol=1e-4) - # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.test_save_load_pretrained_default with CLIP->GroundingDino,GroundingDinoTokenizer->BertTokenizer - def test_save_load_pretrained_default(self): - tokenizer_slow = self.get_tokenizer() - tokenizer_fast = self.get_rust_tokenizer() - image_processor = self.get_image_processor() - - with tempfile.TemporaryDirectory() as tmpdir: - processor_slow = GroundingDinoProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) - processor_slow.save_pretrained(tmpdir) - processor_slow = GroundingDinoProcessor.from_pretrained(tmpdir, use_fast=False) - - processor_fast = GroundingDinoProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) - processor_fast.save_pretrained(tmpdir) - processor_fast = GroundingDinoProcessor.from_pretrained(tmpdir) - - self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) - self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) - self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) - self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) - self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) - - self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor_slow.image_processor, GroundingDinoImageProcessor) - self.assertIsInstance(processor_fast.image_processor, GroundingDinoImageProcessor) - - # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.test_save_load_pretrained_additional_features with CLIP->GroundingDino,GroundingDinoTokenizer->BertTokenizer - def test_save_load_pretrained_additional_features(self): - with tempfile.TemporaryDirectory() as tmpdir: - processor = GroundingDinoProcessor( - tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor() - ) - processor.save_pretrained(tmpdir) - - tokenizer_add_kwargs = BertTokenizer.from_pretrained(tmpdir, bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = GroundingDinoImageProcessor.from_pretrained( - tmpdir, do_normalize=False, padding_value=1.0 - ) - - processor = GroundingDinoProcessor.from_pretrained( - tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, BertTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, GroundingDinoImageProcessor) - - # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.test_image_processor with CLIP->GroundingDino - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_image_proc = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_image_proc: - self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) - - # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.test_tokenizer with CLIP->GroundingDino - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok: - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertSetEqual( - set(inputs.keys()), {"input_ids", "token_type_ids", "attention_mask", "pixel_values", "pixel_mask"} - ) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.test_tokenizer_decode with CLIP->GroundingDino - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - def test_text_preprocessing_equivalence(self): - processor = GroundingDinoProcessor.from_pretrained(self.tmpdirname) + processor = self.get_processor() # check for single input formatted_labels = "a cat. a remote control." diff --git a/tests/models/idefics/test_processing_idefics.py b/tests/models/idefics/test_processing_idefics.py index eeb043e04540..ceb5a0f0a65c 100644 --- a/tests/models/idefics/test_processing_idefics.py +++ b/tests/models/idefics/test_processing_idefics.py @@ -12,18 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest import numpy as np from transformers import ( - AutoProcessor, - IdeficsImageProcessor, IdeficsProcessor, - LlamaTokenizerFast, - PreTrainedTokenizerFast, ) from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available @@ -32,7 +26,7 @@ if is_torch_available(): - import torch + pass if is_vision_available(): from PIL import Image @@ -42,29 +36,17 @@ @require_vision class IdeficsProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = IdeficsProcessor + input_keys = ["pixel_values", "input_ids", "attention_mask", "image_attention_mask"] @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = IdeficsImageProcessor(return_tensors="pt") - tokenizer = LlamaTokenizerFast.from_pretrained("HuggingFaceM4/tiny-random-idefics") - - processor = IdeficsProcessor(image_processor, tokenizer) - - processor.save_pretrained(cls.tmpdirname) - - cls.input_keys = ["pixel_values", "input_ids", "attention_mask", "image_attention_mask"] - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class(return_tensors="pt") @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class.from_pretrained("HuggingFaceM4/tiny-random-idefics") def prepare_prompts(self): """This function prepares a list of PIL images""" @@ -109,52 +91,21 @@ def prepare_prompts(self): return prompts def test_save_load_pretrained_additional_features(self): - with tempfile.TemporaryDirectory() as tmpdir: - processor = IdeficsProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(tmpdir) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = IdeficsProcessor.from_pretrained( - tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) + tokenizer_add_kwargs = self.get_component("tokenizer", bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_component("image_processor", do_normalize=False, padding_value=1.0) + processor = IdeficsProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) + self.assertIsInstance(processor.tokenizer, self._get_component_class_from_processor("tokenizer")) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, IdeficsImageProcessor) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) - - prompts = self.prepare_prompts() - - # test that all prompts succeeded - input_processor = processor(text=prompts, return_tensors="pt", padding="longest") - for key in self.input_keys: - assert torch.is_tensor(input_processor[key]) - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor, return_tensors="pt") - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) + self.assertIsInstance(processor.image_processor, self._get_component_class_from_processor("image_processor")) def test_tokenizer_padding(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer(padding_side="right") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", padding_side="right") processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor, return_tensors="pt") @@ -182,10 +133,7 @@ def test_tokenizer_padding(self): def test_tokenizer_left_padding(self): """Identical to test_tokenizer_padding, but with padding_side not explicitly set.""" - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor = self.get_processor() predicted_tokens = [ " Describe this image.\nAssistant:", diff --git a/tests/models/idefics2/test_processing_idefics2.py b/tests/models/idefics2/test_processing_idefics2.py index d3f816c15405..1ad5de01f83c 100644 --- a/tests/models/idefics2/test_processing_idefics2.py +++ b/tests/models/idefics2/test_processing_idefics2.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest from transformers import Idefics2Processor @@ -26,7 +24,6 @@ if is_vision_available(): from transformers import ( - AutoProcessor, Idefics2Processor, ) @@ -35,15 +32,10 @@ @require_vision class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Idefics2Processor + model_id = "HuggingFaceM4/idefics2-8b" @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) - - processor.save_pretrained(cls.tmpdirname) - + def _setup_test_attributes(cls, processor): cls.image1 = load_image( url_to_local_path( "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" @@ -60,35 +52,18 @@ def setUpClass(cls): cls.bos_token = processor.tokenizer.bos_token cls.image_token = processor.image_token cls.fake_image_token = processor.fake_image_token - cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token) cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token) cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token) cls.image_seq_len = processor.image_seq_len - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def get_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) - @staticmethod def prepare_processor_dict(): return {"image_seq_len": 2} - @classmethod - def tearDownClass(cls): - cls.image1.close() - cls.image2.close() - cls.image3.close() - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - def test_process_interleaved_images_prompts_no_image_splitting(self): - tokenizer = self.get_tokenizer() processor = self.get_processor() + tokenizer = processor.tokenizer processor.image_processor.do_image_splitting = False @@ -148,7 +123,7 @@ def test_process_interleaved_images_prompts_no_image_splitting(self): def test_process_interleaved_images_prompts_image_splitting(self): processor = self.get_processor() - tokenizer = self.get_tokenizer() + tokenizer = processor.tokenizer processor.image_processor.do_image_splitting = True # Test that a single image is processed correctly @@ -207,7 +182,7 @@ def test_process_interleaved_images_prompts_image_splitting(self): def test_add_special_tokens_processor(self): processor = self.get_processor() - tokenizer = self.get_tokenizer() + tokenizer = processor.tokenizer image_str = "" text_str = "In this image, we see" text = text_str + image_str diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py index 6a14dda4af87..a7530fd7f01e 100644 --- a/tests/models/idefics3/test_processing_idefics3.py +++ b/tests/models/idefics3/test_processing_idefics3.py @@ -12,15 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest import numpy as np from transformers import Idefics3Processor from transformers.image_utils import load_image -from transformers.models.auto.processing_auto import AutoProcessor from transformers.testing_utils import require_torch, require_vision from ...test_processing_common import ProcessorTesterMixin, url_to_local_path @@ -30,12 +27,10 @@ @require_vision class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Idefics3Processor + model_id = "HuggingFaceM4/Idefics3-8B-Llama3" @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=2) - processor.save_pretrained(cls.tmpdirname) + def _setup_test_attributes(cls, processor): cls.image1 = load_image( url_to_local_path( "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" @@ -61,15 +56,6 @@ def setUpClass(cls): cls.padding_token_id = processor.tokenizer.pad_token_id cls.image_seq_len = processor.image_seq_len - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def get_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) - @staticmethod def prepare_processor_dict(): return {"image_seq_len": 2} @@ -108,13 +94,6 @@ def get_split_image_expected_tokens(self, processor, image_rows, image_cols): ) return text_split_images - @classmethod - def tearDownClass(cls): - cls.image1.close() - cls.image2.close() - cls.image3.close() - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - def test_process_interleaved_images_prompts_no_image_splitting(self): processor = self.get_processor() processor.image_processor.do_image_splitting = False diff --git a/tests/models/instructblip/test_processing_instructblip.py b/tests/models/instructblip/test_processing_instructblip.py index 019fe85f72e1..e5ce27e3c281 100644 --- a/tests/models/instructblip/test_processing_instructblip.py +++ b/tests/models/instructblip/test_processing_instructblip.py @@ -11,12 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -import pytest - from transformers.testing_utils import require_vision from transformers.utils import is_vision_available @@ -25,12 +21,7 @@ if is_vision_available(): from transformers import ( - AutoProcessor, - BertTokenizerFast, - BlipImageProcessor, - GPT2Tokenizer, InstructBlipProcessor, - PreTrainedTokenizerFast, ) @@ -39,120 +30,15 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = InstructBlipProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = BlipImageProcessor() - tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model") - qformer_tokenizer = BertTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-bert") - - processor = InstructBlipProcessor(image_processor, tokenizer, qformer_tokenizer) - - processor.save_pretrained(cls.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def get_qformer_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).qformer_tokenizer - - def prepare_processor_dict(self): - return {"num_query_tokens": 1} + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-GPT2Model") @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - - def test_save_load_pretrained_additional_features(self): - processor = InstructBlipProcessor( - tokenizer=self.get_tokenizer(), - image_processor=self.get_image_processor(), - qformer_tokenizer=self.get_qformer_tokenizer(), - ) - with tempfile.TemporaryDirectory() as tmpdir: - processor.save_pretrained(tmpdir) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = InstructBlipProcessor.from_pretrained( - tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, BlipImageProcessor) - self.assertIsInstance(processor.qformer_tokenizer, BertTokenizerFast) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - qformer_tokenizer = self.get_qformer_tokenizer() - processor_kwargs = self.prepare_processor_dict() + def _setup_qformer_tokenizer(cls): + qformer_tokenizer_class = cls._get_component_class_from_processor("qformer_tokenizer") + return qformer_tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-bert") - processor = InstructBlipProcessor( - tokenizer=tokenizer, - image_processor=image_processor, - qformer_tokenizer=qformer_tokenizer, - **processor_kwargs, - ) - - image_input = self.prepare_image_inputs() - - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_feat_extract: - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - qformer_tokenizer = self.get_qformer_tokenizer() - processor_kwargs = self.prepare_processor_dict() - - processor = InstructBlipProcessor( - tokenizer=tokenizer, - image_processor=image_processor, - qformer_tokenizer=qformer_tokenizer, - **processor_kwargs, - ) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual( - list(inputs.keys()), - ["qformer_input_ids", "qformer_attention_mask", "input_ids", "attention_mask", "pixel_values"], - ) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - qformer_tokenizer = self.get_qformer_tokenizer() - processor_kwargs = self.prepare_processor_dict() - - processor = InstructBlipProcessor( - tokenizer=tokenizer, - image_processor=image_processor, - qformer_tokenizer=qformer_tokenizer, - **processor_kwargs, - ) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) + @staticmethod + def prepare_processor_dict(): + return {"num_query_tokens": 1} diff --git a/tests/models/instructblipvideo/test_processing_instructblipvideo.py b/tests/models/instructblipvideo/test_processing_instructblipvideo.py index dc476ff2436f..74e1810a3f29 100644 --- a/tests/models/instructblipvideo/test_processing_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_processing_instructblipvideo.py @@ -11,12 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -import pytest - from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torchvision_available, is_vision_available @@ -25,15 +21,11 @@ if is_vision_available(): from transformers import ( - AutoProcessor, - BertTokenizerFast, - GPT2Tokenizer, InstructBlipVideoProcessor, - PreTrainedTokenizerFast, ) if is_torchvision_available(): - from transformers import InstructBlipVideoVideoProcessor + pass @require_vision @@ -42,144 +34,19 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = InstructBlipVideoProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - video_processor = InstructBlipVideoVideoProcessor() - tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model") - qformer_tokenizer = BertTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-bert") - - processor = InstructBlipVideoProcessor(video_processor, tokenizer, qformer_tokenizer) - - processor.save_pretrained(cls.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_qformer_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).qformer_tokenizer - - def prepare_processor_dict(self): - return {"num_query_tokens": 1} - - def get_video_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-GPT2Model") @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - - def test_save_load_pretrained_additional_features(self): - processor = InstructBlipVideoProcessor( - tokenizer=self.get_tokenizer(), - video_processor=self.get_video_processor(), - qformer_tokenizer=self.get_qformer_tokenizer(), - ) - with tempfile.TemporaryDirectory() as tmpdir: - processor.save_pretrained(tmpdir) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - video_processor_add_kwargs = self.get_video_processor(do_normalize=False, padding_value=1.0) - - processor = InstructBlipVideoProcessor.from_pretrained( - tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) - - self.assertEqual(processor.video_processor.to_json_string(), video_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.video_processor, InstructBlipVideoVideoProcessor) - self.assertIsInstance(processor.qformer_tokenizer, BertTokenizerFast) - - def test_video_processor(self): - video_processor = self.get_video_processor() - tokenizer = self.get_tokenizer() - qformer_tokenizer = self.get_qformer_tokenizer() - processor_kwargs = self.prepare_processor_dict() - - processor = InstructBlipVideoProcessor( - tokenizer=tokenizer, - video_processor=video_processor, - qformer_tokenizer=qformer_tokenizer, - **processor_kwargs, - ) - - image_input = self.prepare_image_inputs() + def _setup_qformer_tokenizer(cls): + qformer_tokenizer_class = cls._get_component_class_from_processor("qformer_tokenizer") + return qformer_tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-bert") - input_feat_extract = video_processor(image_input, return_tensors="pt") - input_processor = processor(images=image_input, return_tensors="pt") - - for key in input_feat_extract: - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - video_processor = self.get_video_processor() - tokenizer = self.get_tokenizer() - qformer_tokenizer = self.get_qformer_tokenizer() - processor_kwargs = self.prepare_processor_dict() - - processor = InstructBlipVideoProcessor( - tokenizer=tokenizer, - video_processor=video_processor, - qformer_tokenizer=qformer_tokenizer, - **processor_kwargs, - ) - - input_str = ["lower newer"] - encoded_processor = processor(text=input_str) - encoded_tokens = tokenizer(input_str, return_token_type_ids=False) - encoded_tokens_qformer = qformer_tokenizer(input_str, return_token_type_ids=False) - - for key in encoded_tokens: - self.assertListEqual(encoded_tokens[key], encoded_processor[key]) - - for key in encoded_tokens_qformer: - self.assertListEqual(encoded_tokens_qformer[key], encoded_processor["qformer_" + key]) - - def test_processor(self): - video_processor = self.get_video_processor() - tokenizer = self.get_tokenizer() - qformer_tokenizer = self.get_qformer_tokenizer() - processor_kwargs = self.prepare_processor_dict() - - processor = InstructBlipVideoProcessor( - tokenizer=tokenizer, - video_processor=video_processor, - qformer_tokenizer=qformer_tokenizer, - **processor_kwargs, - ) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual( - list(inputs.keys()), - ["qformer_input_ids", "qformer_attention_mask", "input_ids", "attention_mask", "pixel_values"], - ) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - video_processor = self.get_video_processor() - tokenizer = self.get_tokenizer() - qformer_tokenizer = self.get_qformer_tokenizer() - processor_kwargs = self.prepare_processor_dict() - - processor = InstructBlipVideoProcessor( - tokenizer=tokenizer, - video_processor=video_processor, - qformer_tokenizer=qformer_tokenizer, - **processor_kwargs, - ) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) + @staticmethod + def prepare_processor_dict(): + return {"num_query_tokens": 1} - self.assertListEqual(decoded_tok, decoded_processor) + @unittest.skip("InstructBlipVideoProcessor takes in 'images' instead of 'videos' (legacy)") + def test_processor_with_multiple_inputs(self): + pass diff --git a/tests/models/internvl/test_processing_internvl.py b/tests/models/internvl/test_processing_internvl.py index 154b02b17da8..1432f769a1d3 100644 --- a/tests/models/internvl/test_processing_internvl.py +++ b/tests/models/internvl/test_processing_internvl.py @@ -13,15 +13,13 @@ # limitations under the License. import inspect -import shutil -import tempfile import unittest from parameterized import parameterized -from transformers import AutoProcessor, AutoTokenizer, InternVLProcessor +from transformers import InternVLProcessor from transformers.testing_utils import require_av, require_torch, require_vision -from transformers.utils import is_torch_available, is_vision_available +from transformers.utils import is_torch_available from ...test_processing_common import ProcessorTesterMixin, url_to_local_path @@ -30,32 +28,30 @@ import torch -if is_vision_available(): - from transformers import GotOcr2ImageProcessor, InternVLVideoProcessor - - @require_vision class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = InternVLProcessor videos_input_name = "pixel_values" @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = GotOcr2ImageProcessor( + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class( do_resize=True, size={"height": 20, "width": 20}, max_patches=2, do_rescale=True, rescale_factor=1 / 255, do_normalize=True, - do_center_crop=True, image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225], do_convert_rgb=True, ) - video_processor = InternVLVideoProcessor( + + @classmethod + def _setup_video_processor(cls): + video_processor_class = cls._get_component_class_from_processor("video_processor") + return video_processor_class( do_resize=True, size={"height": 20, "width": 20}, do_rescale=True, @@ -65,38 +61,25 @@ def setUpClass(cls): image_std=[0.229, 0.224, 0.225], do_convert_rgb=True, ) - tokenizer = AutoTokenizer.from_pretrained("OpenGVLab/InternVL3-1B-hf", padding_side="left") - processor_kwargs = cls.prepare_processor_dict() - processor = InternVLProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - video_processor=video_processor, - **processor_kwargs, - ) - processor.save_pretrained(cls.tmpdirname) + + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class.from_pretrained("OpenGVLab/InternVL3-1B-hf", padding_side="left") + + @classmethod + def _setup_test_attributes(cls, processor): cls.image_token = processor.image_token cls.video_token = processor.video_token + @unittest.skip("InternVL requires text") + def test_video_processor_defaults(self): + pass + @staticmethod def prepare_processor_dict(): return {"image_seq_length": 2} - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def get_video_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor - - def get_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - # Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens def test_get_num_vision_tokens(self): "Tests general functionality of the helper used internally in vLLM" diff --git a/tests/models/janus/test_processing_janus.py b/tests/models/janus/test_processing_janus.py index 47efd5c2be6d..0702ff50e546 100644 --- a/tests/models/janus/test_processing_janus.py +++ b/tests/models/janus/test_processing_janus.py @@ -14,45 +14,31 @@ # limitations under the License. """Testing suite for the PyTorch Janus model.""" -import tempfile import unittest import numpy as np -from transformers import AutoProcessor, AutoTokenizer, JanusProcessor +from transformers import JanusProcessor from ...test_processing_common import ProcessorTesterMixin, url_to_local_path class JanusProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = JanusProcessor + model_id = "deepseek-community/Janus-Pro-1B" - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() + @classmethod + def _setup_from_pretrained(cls, model_id, **kwargs): special_image_tokens = { "image_token": "", "boi_token": "", "eoi_token": "", } - - processor = self.processor_class.from_pretrained( - "deepseek-community/Janus-Pro-1B", - extra_special_tokens=special_image_tokens, - **self.prepare_processor_dict(), - ) + processor = super()._setup_from_pretrained(model_id, extra_special_tokens=special_image_tokens) # Set the processor to use the default system prompt to False as it's used based on input modality. # Hence set to False to avoid any issues in the test irrespective of inputs. processor.use_default_system_prompt = False - processor.save_pretrained(self.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def get_processor(self): - return AutoProcessor.from_pretrained(self.tmpdirname) + return processor def test_chat_template_single(self): """ diff --git a/tests/models/kosmos2/test_processing_kosmos2.py b/tests/models/kosmos2/test_processing_kosmos2.py index 56b193eda110..b2d83a25639f 100644 --- a/tests/models/kosmos2/test_processing_kosmos2.py +++ b/tests/models/kosmos2/test_processing_kosmos2.py @@ -13,8 +13,6 @@ # limitations under the License. import os -import shutil -import tempfile import unittest from tempfile import TemporaryDirectory @@ -38,10 +36,8 @@ from PIL import Image from transformers import ( - AutoProcessor, CLIPImageProcessor, Kosmos2Processor, - PreTrainedTokenizerFast, XLMRobertaTokenizer, XLMRobertaTokenizerFast, ) @@ -57,27 +53,20 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Kosmos2Processor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = CLIPImageProcessor(do_center_crop=False) - + def _setup_tokenizer(cls): # We have a SentencePiece fixture for testing slow_tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB) fast_tokenizer = XLMRobertaTokenizerFast(__slow_tokenizer=slow_tokenizer) - - processor = Kosmos2Processor(image_processor, fast_tokenizer) - processor.save_pretrained(cls.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + return fast_tokenizer @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class(do_center_crop=False) + + @unittest.skip("Kosmos2Processor adds special tokens to the text") + def test_tokenizer_defaults(self): + pass def test_image_processor_load_save_reload(self): # make sure load from Hub repo. -> save -> reload locally work @@ -88,85 +77,6 @@ def test_image_processor_load_save_reload(self): assert image_processor.to_dict() == reloaded_image_processor.to_dict() assert image_processor.to_json_string() == reloaded_image_processor.to_json_string() - def test_save_load_pretrained_additional_features(self): - with tempfile.TemporaryDirectory() as tmpdir: - processor = Kosmos2Processor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(tmpdir) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = Kosmos2Processor.from_pretrained( - tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, CLIPImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Kosmos2Processor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_image_processor = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_image_processor: - self.assertAlmostEqual(input_image_processor[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Kosmos2Processor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "This is a test" - - encoded_processor = processor(text=input_str, add_eos_token=True) - - encoded_tok = tokenizer(input_str, return_token_type_ids=False) - - for key in encoded_tok: - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Kosmos2Processor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "This is a test" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual( - list(inputs.keys()), ["pixel_values", "input_ids", "attention_mask", "image_embeds_position_mask"] - ) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Kosmos2Processor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - @require_torch def test_full_processor(self): url = url_to_local_path("https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/two_dogs.jpg") diff --git a/tests/models/kosmos2_5/test_processor_kosmos2_5.py b/tests/models/kosmos2_5/test_processor_kosmos2_5.py index f141afd97d84..64bfdb276efb 100644 --- a/tests/models/kosmos2_5/test_processor_kosmos2_5.py +++ b/tests/models/kosmos2_5/test_processor_kosmos2_5.py @@ -14,8 +14,6 @@ # limitations under the License. import os -import shutil -import tempfile import unittest from tempfile import TemporaryDirectory @@ -40,7 +38,6 @@ AutoTokenizer, Kosmos2_5ImageProcessor, Kosmos2_5Processor, - PreTrainedTokenizerFast, ) @@ -48,22 +45,11 @@ class Kosmos2_5ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Kosmos2_5Processor images_input_name = "flattened_patches" + model_id = "microsoft/kosmos-2.5" - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - image_processor = Kosmos2_5ImageProcessor() - tokenizer = AutoTokenizer.from_pretrained("microsoft/kosmos-2.5") - processor = Kosmos2_5Processor(image_processor, tokenizer) - processor.save_pretrained(self.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def tearDown(self): - shutil.rmtree(self.tmpdirname) + @unittest.skip("Kosmos2_5Processor removes 'rows' and 'cols' from the output") + def test_image_processor_defaults(self): + pass def test_image_procesor_load_save_reload(self): # make sure load from Hub repo. -> save -> reload locally work @@ -74,51 +60,6 @@ def test_image_procesor_load_save_reload(self): assert image_processor.to_dict() == reloaded_image_processor.to_dict() assert image_processor.to_json_string() == reloaded_image_processor.to_json_string() - def test_save_load_pretrained_additional_features(self): - processor = Kosmos2_5Processor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = Kosmos2_5Processor.from_pretrained( - self.tmpdirname, - bos_token="(BOS)", - eos_token="(EOS)", - do_normalize=False, - padding_value=1.0, - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) - - self.assertEqual( - processor.image_processor.to_json_string(), - image_processor_add_kwargs.to_json_string(), - ) - self.assertIsInstance(processor.image_processor, Kosmos2_5ImageProcessor) - - @unittest.skip(reason="kosmos-2.5 must have both image and text") - def test_image_processor(self): - pass - - @unittest.skip(reason="kosmos-2.5 must have both image and text") - def test_tokenizer(self): - pass - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Kosmos2_5Processor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - def test_can_load_various_tokenizers(self): for checkpoint in ["microsoft/kosmos-2.5"]: processor = AutoProcessor.from_pretrained(checkpoint) @@ -127,8 +68,8 @@ def test_can_load_various_tokenizers(self): @require_torch def test_model_input_names(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") processor = Kosmos2_5Processor(tokenizer=tokenizer, image_processor=image_processor) diff --git a/tests/models/layoutlmv2/test_processing_layoutlmv2.py b/tests/models/layoutlmv2/test_processing_layoutlmv2.py index 9a116e54c9a7..05f064c1a9ae 100644 --- a/tests/models/layoutlmv2/test_processing_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_processing_layoutlmv2.py @@ -13,20 +13,20 @@ # limitations under the License. import os -import shutil -import tempfile import unittest from functools import cached_property -from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast from transformers.models.layoutlmv2 import LayoutLMv2Processor, LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast from transformers.models.layoutlmv2.tokenization_layoutlmv2 import VOCAB_FILES_NAMES from transformers.testing_utils import require_pytesseract, require_tokenizers, require_torch, slow -from transformers.utils import is_pytesseract_available +from transformers.utils import is_pytesseract_available, is_torchvision_available from ...test_processing_common import ProcessorTesterMixin +if is_torchvision_available(): + from transformers import LayoutLMv2ImageProcessorFast + if is_pytesseract_available(): from transformers import LayoutLMv2ImageProcessor @@ -34,11 +34,19 @@ @require_pytesseract @require_tokenizers class LayoutLMv2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): - tokenizer_class = LayoutLMv2Tokenizer - rust_tokenizer_class = LayoutLMv2TokenizerFast processor_class = LayoutLMv2Processor - def setUp(self): + @classmethod + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class( + do_resize=True, + size=224, + apply_ocr=True, + ) + + @classmethod + def _setup_tokenizer(cls): vocab_tokens = [ "[UNK]", "[CLS]", @@ -56,59 +64,26 @@ def setUp(self): "low", "lowest", ] - - image_processor_map = { - "do_resize": True, - "size": 224, - "apply_ocr": True, - } - - self.tmpdirname = tempfile.mkdtemp() - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + return LayoutLMv2Tokenizer.from_pretrained(cls.tmpdirname) - image_processor = LayoutLMv2ImageProcessor(**image_processor_map) - processor = LayoutLMv2Processor(tokenizer=self.get_tokenizer(), image_processor=image_processor) - processor.save_pretrained(self.tmpdirname) - - def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer: - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) - - def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast: - return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) - - def get_tokenizers(self, **kwargs) -> list[PreTrainedTokenizerBase]: - return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)] + @unittest.skip("LayoutLMv2Processor doesn't use pixel_values") + def test_image_processor_defaults(self): + pass - def get_image_processor(self, **kwargs): - return LayoutLMv2ImageProcessor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_save_load_pretrained_default(self): - image_processor = self.get_image_processor() - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer) - - processor.save_pretrained(self.tmpdirname) - processor = LayoutLMv2Processor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast)) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor) + @unittest.skip("LayoutLMv2Processor doesn't use pixel_values") + def test_processor_with_multiple_inputs(self): + pass def test_save_load_pretrained_additional_features(self): - processor = LayoutLMv2Processor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer()) + processor = self.get_processor() processor.save_pretrained(self.tmpdirname) # slow tokenizer - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30) + tokenizer_add_kwargs = self.get_component("tokenizer", bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_component("image_processor", do_resize=False, size=30, use_fast=False) processor = LayoutLMv2Processor.from_pretrained( self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30 @@ -121,8 +96,8 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor) # fast tokenizer - tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30) + tokenizer_add_kwargs = self.get_component("tokenizer", bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_component("image_processor", do_resize=False, size=30) processor = LayoutLMv2Processor.from_pretrained( self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30 @@ -132,7 +107,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, LayoutLMv2TokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor) + self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessorFast) @slow def test_overflowing_tokens(self): @@ -142,13 +117,13 @@ def test_overflowing_tokens(self): # set up datasets = load_dataset("nielsr/funsd") - processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr") + processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", apply_ocr=False) def preprocess_data(examples): images = [image.convert("RGB") for image in examples["image"]] - words = examples["words"] - boxes = examples["bboxes"] - word_labels = examples["ner_tags"] + words = list(examples["words"]) + boxes = list(examples["bboxes"]) + word_labels = list(examples["ner_tags"]) encoded_inputs = processor( images, words, diff --git a/tests/models/layoutlmv3/test_processing_layoutlmv3.py b/tests/models/layoutlmv3/test_processing_layoutlmv3.py index b7a51a940a5b..9385c55c8f30 100644 --- a/tests/models/layoutlmv3/test_processing_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_processing_layoutlmv3.py @@ -14,12 +14,9 @@ import json import os -import shutil -import tempfile import unittest from functools import cached_property -from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast from transformers.models.layoutlmv3 import LayoutLMv3Processor, LayoutLMv3Tokenizer, LayoutLMv3TokenizerFast from transformers.models.layoutlmv3.tokenization_layoutlmv3 import VOCAB_FILES_NAMES from transformers.testing_utils import require_pytesseract, require_tokenizers, require_torch, slow @@ -35,117 +32,37 @@ @require_pytesseract @require_tokenizers class LayoutLMv3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): - tokenizer_class = LayoutLMv3Tokenizer - rust_tokenizer_class = LayoutLMv3TokenizerFast processor_class = LayoutLMv3Processor - def setUp(self): + @classmethod + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class( + do_resize=True, + size=224, + apply_ocr=True, + ) + + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer", use_fast=False) # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt - vocab = [ - "l", - "o", - "w", - "e", - "r", - "s", - "t", - "i", - "d", - "n", - "\u0120", - "\u0120l", - "\u0120n", - "\u0120lo", - "\u0120low", - "er", - "\u0120lowest", - "\u0120newer", - "\u0120wider", - "", - ] - self.tmpdirname = tempfile.mkdtemp() + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", ""] # fmt: skip vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - image_processor_map = { - "do_resize": True, - "size": 224, - "apply_ocr": True, - } - - image_processor = LayoutLMv3ImageProcessor(**image_processor_map) - processor = LayoutLMv3Processor(tokenizer=self.get_tokenizer(), image_processor=image_processor) - processor.save_pretrained(self.tmpdirname) - - def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer: - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) - - def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast: - return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) - - def get_tokenizers(self, **kwargs) -> list[PreTrainedTokenizerBase]: - return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)] - - def get_image_processor(self, **kwargs): - return LayoutLMv3ImageProcessor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_save_load_pretrained_default(self): - image_processor = self.get_image_processor() - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer) - - processor.save_pretrained(self.tmpdirname) - processor = LayoutLMv3Processor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, (LayoutLMv3Tokenizer, LayoutLMv3TokenizerFast)) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor) - - def test_save_load_pretrained_additional_features(self): - processor = LayoutLMv3Processor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer()) - processor.save_pretrained(self.tmpdirname) - - # slow tokenizer - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30) - - processor = LayoutLMv3Processor.from_pretrained( - self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, LayoutLMv3Tokenizer) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor) - - # fast tokenizer - tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30) - - processor = LayoutLMv3Processor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, LayoutLMv3TokenizerFast) + return tokenizer_class.from_pretrained(cls.tmpdirname, unk_token="") - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor) + @unittest.skip("LayoutLMv3 Image Processor doesn't return image tensors") + def test_image_processor_defaults(self): + pass # different use cases tests diff --git a/tests/models/layoutxlm/test_processing_layoutxlm.py b/tests/models/layoutxlm/test_processing_layoutxlm.py index effbc9794353..caf591bb6f4a 100644 --- a/tests/models/layoutxlm/test_processing_layoutxlm.py +++ b/tests/models/layoutxlm/test_processing_layoutxlm.py @@ -12,14 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json -import os -import shutil -import tempfile import unittest from functools import cached_property -from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast from transformers.models.layoutxlm import LayoutXLMProcessor, LayoutXLMTokenizer, LayoutXLMTokenizerFast from transformers.testing_utils import ( require_pytesseract, @@ -28,7 +23,7 @@ require_torch, slow, ) -from transformers.utils import FEATURE_EXTRACTOR_NAME, is_pytesseract_available +from transformers.utils import is_pytesseract_available from ...test_processing_common import ProcessorTesterMixin @@ -41,86 +36,46 @@ @require_sentencepiece @require_tokenizers class LayoutXLMProcessorTest(ProcessorTesterMixin, unittest.TestCase): - tokenizer_class = LayoutXLMTokenizer - rust_tokenizer_class = LayoutXLMTokenizerFast processor_class = LayoutXLMProcessor @classmethod - def setUpClass(cls): - image_processor_map = { - "do_resize": True, - "size": 224, - "apply_ocr": True, - } - - cls.tmpdirname = tempfile.mkdtemp() - cls.feature_extraction_file = os.path.join(cls.tmpdirname, FEATURE_EXTRACTOR_NAME) - with open(cls.feature_extraction_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(image_processor_map) + "\n") - - # taken from `test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_save_pretrained` - cls.tokenizer_pretrained_name = "hf-internal-testing/tiny-random-layoutxlm" - - tokenizer = cls.get_tokenizer() - tokenizer.save_pretrained(cls.tmpdirname) - image_processor = cls.get_image_processor() - image_processor.save_pretrained(cls.tmpdirname) - processor = LayoutXLMProcessor(tokenizer=tokenizer, image_processor=image_processor) - processor.save_pretrained(cls.tmpdirname) + def _setup_image_processor(cls): + # hardcode as we can't use IMAGE_PROCESSOR_MAPPING to get the class from the config (layoutxlm is not in the mapping) + image_processor_class = LayoutLMv2ImageProcessor + return image_processor_class( + do_resize=True, + size=224, + apply_ocr=True, + ) @classmethod - def get_tokenizer(cls, **kwargs) -> PreTrainedTokenizer: - return cls.tokenizer_class.from_pretrained(cls.tokenizer_pretrained_name, **kwargs) + def _setup_tokenizer(cls): + # hardcode as we can't use TOKENIZER_MAPPING to get the class from the config (layoutxlm is not in the mapping) + tokenizer_class = LayoutXLMTokenizer + return tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-layoutxlm") - @classmethod - def get_rust_tokenizer(cls, **kwargs) -> PreTrainedTokenizerFast: - return cls.rust_tokenizer_class.from_pretrained(cls.tokenizer_pretrained_name, **kwargs) + @unittest.skip("LayoutXLM Image Processor doesn't return image tensors") + def test_image_processor_defaults(self): + pass - @classmethod - def get_tokenizers(cls, **kwargs) -> list[PreTrainedTokenizerBase]: - return [cls.get_tokenizer(**kwargs), cls.get_rust_tokenizer(**kwargs)] - - @classmethod - def get_image_processor(cls, **kwargs): - return LayoutLMv2ImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - - def test_save_load_pretrained_default(self): - image_processor = self.get_image_processor() - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer) - - with tempfile.TemporaryDirectory() as tmpdir: - processor.save_pretrained(tmpdir) - processor = LayoutXLMProcessor.from_pretrained(tmpdir) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor) + @unittest.skip("LayoutLMv2Processor doesn't use pixel_values") + def test_processor_with_multiple_inputs(self): + pass def test_save_load_pretrained_additional_features(self): - with tempfile.TemporaryDirectory() as tmpdir: - processor = LayoutXLMProcessor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer()) - processor.save_pretrained(tmpdir) - - # slow tokenizer - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30) - - processor = LayoutXLMProcessor.from_pretrained( - tmpdir, - use_fast=False, - bos_token="(BOS)", - eos_token="(EOS)", - do_resize=False, - size=30, - ) + processor = self.get_processor() + # slow tokenizer + tokenizer_add_kwargs = self.get_component("tokenizer", bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_component("image_processor", do_resize=False, size=30) + + processor = LayoutXLMProcessor.from_pretrained( + self.tmpdirname, + use_fast=False, + bos_token="(BOS)", + eos_token="(EOS)", + do_resize=False, + size=30, + ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizer) @@ -129,8 +84,8 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor) # fast tokenizer - tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30) + tokenizer_add_kwargs = self.get_component("tokenizer", bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_component("image_processor", do_resize=False, size=30) processor = LayoutXLMProcessor.from_pretrained( self.tmpdirname, use_xlm=True, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30 @@ -154,9 +109,9 @@ def test_overflowing_tokens(self): def preprocess_data(examples): images = [image.convert("RGB") for image in examples["image"]] - words = examples["words"] - boxes = examples["bboxes"] - word_labels = examples["ner_tags"] + words = list(examples["words"]) + boxes = list(examples["bboxes"]) + word_labels = list(examples["ner_tags"]) encoded_inputs = processor( images, words, diff --git a/tests/models/lfm2_vl/test_processing_lfm2_vl.py b/tests/models/lfm2_vl/test_processing_lfm2_vl.py index e087519c8f4c..8810bd5ee064 100755 --- a/tests/models/lfm2_vl/test_processing_lfm2_vl.py +++ b/tests/models/lfm2_vl/test_processing_lfm2_vl.py @@ -13,13 +13,11 @@ # limitations under the License. import math -import shutil -import tempfile import unittest import numpy as np -from transformers import AutoTokenizer, Lfm2VlProcessor +from transformers import Lfm2VlProcessor from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torchvision_available, is_vision_available @@ -30,7 +28,7 @@ from PIL import Image if is_torchvision_available(): - from transformers import Lfm2VlImageProcessorFast + pass @require_torch @@ -39,26 +37,28 @@ class Lfm2VlProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Lfm2VlProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - processor_kwargs = cls.prepare_processor_dict() - image_processor = Lfm2VlImageProcessorFast( + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class( tile_size=14, min_image_tokens=2, max_image_tokens=10, encoder_patch_size=2, do_image_splitting=False, ) - tokenizer = AutoTokenizer.from_pretrained("LiquidAI/LFM2-VL-1.6B", **processor_kwargs) - processor = Lfm2VlProcessor(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs) - processor.save_pretrained(cls.tmpdirname) + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + processor_kwargs = cls.prepare_processor_dict() + return tokenizer_class.from_pretrained("LiquidAI/LFM2-VL-1.6B", **processor_kwargs) + @classmethod + def _setup_test_attributes(cls, processor): # Create images with different sizes cls.small_image = Image.new("RGB", (256, 256)) cls.large_image = Image.new("RGB", (512, 1024)) cls.high_res_image = Image.new("RGB", (1024, 1024)) - cls.bos_token = processor.tokenizer.bos_token cls.image_token = processor.image_token @@ -69,15 +69,6 @@ def setUpClass(cls): cls.padding_token_id = processor.tokenizer.pad_token_id cls.image_thumbnail_token_id = processor.tokenizer.convert_tokens_to_ids(processor.image_thumbnail_token) - def get_tokenizer(self, **kwargs): - return Lfm2VlProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return Lfm2VlProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def get_processor(self, **kwargs): - return Lfm2VlProcessor.from_pretrained(self.tmpdirname, **kwargs) - @staticmethod def prepare_processor_dict(): chat_template = ( @@ -102,6 +93,10 @@ def prepare_processor_dict(): ) return {"chat_template": chat_template} + @unittest.skip("Lfm2VlProcessor adds special tokens to the text") + def test_tokenizer_defaults(self): + pass + # Override as Lfm2VL needs images/video to be an explicitly nested batch def prepare_image_inputs(self, batch_size=None): """This function prepares a list of PIL images for testing""" @@ -125,10 +120,6 @@ def get_split_image_expected_tokens(self, processor, image_rows, image_cols, add text_split_images += [self.image_end_token_id] return text_split_images - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) - def test_process_interleaved_images_prompts_no_image_splitting_single_image(self): processor_components = self.prepare_components() processor_components["tokenizer"] = self.get_component("tokenizer", padding_side="left") diff --git a/tests/models/llama4/test_processing_llama4.py b/tests/models/llama4/test_processing_llama4.py index aef3539a37ea..960ffcaee518 100644 --- a/tests/models/llama4/test_processing_llama4.py +++ b/tests/models/llama4/test_processing_llama4.py @@ -12,42 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. -import shutil -import tempfile import unittest -from transformers import AutoProcessor, Llama4Processor, PreTrainedTokenizerFast +from transformers import Llama4Processor from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin -if is_vision_available(): - from transformers import Llama4ImageProcessorFast - - @require_vision class Llama4ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Llama4Processor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = Llama4ImageProcessorFast(max_patches=1, size={"height": 20, "width": 20}) - tokenizer = PreTrainedTokenizerFast.from_pretrained("unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit") - processor_kwargs = cls.prepare_processor_dict() - processor = Llama4Processor(image_processor, tokenizer, **processor_kwargs) - processor.save_pretrained(cls.tmpdirname) - cls.image_token = processor.image_token + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class(max_patches=1, size={"height": 20, "width": 20}) - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + return tokenizer_class.from_pretrained("unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit") @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname) + def _setup_test_attributes(cls, processor): + cls.image_token = processor.image_token diff --git a/tests/models/llava/test_processing_llava.py b/tests/models/llava/test_processing_llava.py index b9e64cc56a5d..7b263ae7eb86 100644 --- a/tests/models/llava/test_processing_llava.py +++ b/tests/models/llava/test_processing_llava.py @@ -12,46 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. import json -import shutil -import tempfile import unittest -from transformers import AutoProcessor, AutoTokenizer, LlamaTokenizerFast, LlavaProcessor +from transformers import AutoTokenizer, LlavaProcessor from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin -if is_vision_available(): - from transformers import CLIPImageProcessor - - @require_vision class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = LlavaProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() + def _setup_image_processor(cls): + image_processor_class = cls._get_component_class_from_processor("image_processor") + return image_processor_class(do_center_crop=False) - image_processor = CLIPImageProcessor(do_center_crop=False) - tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b") + @classmethod + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + tokenizer = tokenizer_class.from_pretrained("huggyllama/llama-7b") tokenizer.add_special_tokens({"additional_special_tokens": [""]}) - processor_kwargs = cls.prepare_processor_dict() - processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs) - processor.save_pretrained(cls.tmpdirname) - cls.image_token = processor.image_token - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + if not tokenizer.pad_token: + tokenizer.pad_token = "[PAD]" + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = 0 + return tokenizer @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) + def _setup_test_attributes(cls, processor): + cls.image_token = processor.image_token @staticmethod def prepare_processor_dict(): diff --git a/tests/models/llava_next/test_processing_llava_next.py b/tests/models/llava_next/test_processing_llava_next.py index 915ea238b255..5acd8a1b2fa7 100644 --- a/tests/models/llava_next/test_processing_llava_next.py +++ b/tests/models/llava_next/test_processing_llava_next.py @@ -13,50 +13,37 @@ # limitations under the License. import json -import shutil -import tempfile import unittest import torch -from transformers import LlamaTokenizerFast, LlavaNextProcessor +from transformers import LlavaNextProcessor from transformers.testing_utils import ( require_vision, ) -from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin -if is_vision_available(): - from transformers import LlavaNextImageProcessor - - @require_vision class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = LlavaNextProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - - image_processor = LlavaNextImageProcessor() - tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b") + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + print("tokenizer_class", tokenizer_class) + tokenizer = tokenizer_class.from_pretrained("huggyllama/llama-7b") tokenizer.add_special_tokens({"additional_special_tokens": [""]}) - processor_kwargs = cls.prepare_processor_dict() - processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs) - processor.save_pretrained(cls.tmpdirname) - cls.image_token = processor.image_token - - def get_tokenizer(self, **kwargs): - return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + if not tokenizer.pad_token: + tokenizer.pad_token = "[PAD]" + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = 0 + return tokenizer @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdirname, ignore_errors=True) + def _setup_test_attributes(cls, processor): + cls.image_token = processor.image_token @staticmethod def prepare_processor_dict(): diff --git a/tests/models/llava_next_video/test_processing_llava_next_video.py b/tests/models/llava_next_video/test_processing_llava_next_video.py index bf5d6082059d..592d1c23d77a 100644 --- a/tests/models/llava_next_video/test_processing_llava_next_video.py +++ b/tests/models/llava_next_video/test_processing_llava_next_video.py @@ -13,13 +13,11 @@ # limitations under the License. import json -import shutil -import tempfile import unittest import torch -from transformers import AutoProcessor, LlamaTokenizerFast, LlavaNextVideoProcessor +from transformers import LlavaNextVideoProcessor from transformers.testing_utils import require_vision from transformers.utils import is_torchvision_available, is_vision_available @@ -27,10 +25,8 @@ if is_vision_available(): - from transformers import LlavaNextImageProcessor - if is_torchvision_available(): - from transformers import LlavaNextVideoVideoProcessor + pass @require_vision @@ -38,34 +34,17 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = LlavaNextVideoProcessor @classmethod - def setUpClass(cls): - cls.tmpdirname = tempfile.mkdtemp() - image_processor = LlavaNextImageProcessor() - video_processor = LlavaNextVideoVideoProcessor() - tokenizer = LlamaTokenizerFast.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf") + def _setup_tokenizer(cls): + tokenizer_class = cls._get_component_class_from_processor("tokenizer") + tokenizer = tokenizer_class.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf") tokenizer.add_special_tokens({"additional_special_tokens": ["", "