diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index fbca27b2ff39..ac927b8d2306 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -59,9 +59,6 @@ class AlignProcessor(ProcessorMixin): """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "EfficientNetImageProcessor" - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") valid_processor_kwargs = AlignProcessorKwargs def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 24631ecacbd7..933a5e48dfed 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -35,10 +35,6 @@ class AltCLIPProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast") - tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast") - @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor") def __init__(self, image_processor=None, tokenizer=None): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 66483c248a2a..4d471fe40f6a 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -906,10 +906,6 @@ class AriaProcessor(ProcessorMixin): A dictionary indicating size conversions for images. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AriaImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor=None, diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index d0841c96aee2..c29c289649da 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -67,10 +67,6 @@ class AriaProcessor(ProcessorMixin): A dictionary indicating size conversions for images. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AriaImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor=None, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 7e2e84a445ef..b9a16ae4272e 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -223,6 +223,7 @@ ("layoutlm", "LayoutLMConfig"), ("layoutlmv2", "LayoutLMv2Config"), ("layoutlmv3", "LayoutLMv3Config"), + ("layoutxlm", "LayoutLMv2Config"), ("led", "LEDConfig"), ("levit", "LevitConfig"), ("lfm2", "Lfm2Config"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 22bc20728aad..7d3b9df512fe 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -41,6 +41,7 @@ ("audio-spectrogram-transformer", "ASTFeatureExtractor"), ("clap", "ClapFeatureExtractor"), ("clvp", "ClvpFeatureExtractor"), + ("csm", "EncodecFeatureExtractor"), ("dac", "DacFeatureExtractor"), ("data2vec-audio", "Wav2Vec2FeatureExtractor"), ("dia", "DiaFeatureExtractor"), @@ -49,14 +50,20 @@ ("granite_speech", "GraniteSpeechFeatureExtractor"), ("hubert", "Wav2Vec2FeatureExtractor"), ("kyutai_speech_to_text", "KyutaiSpeechToTextFeatureExtractor"), + 
("markuplm", "MarkupLMFeatureExtractor"), ("mctct", "MCTCTFeatureExtractor"), ("mimi", "EncodecFeatureExtractor"), ("moonshine", "Wav2Vec2FeatureExtractor"), ("moshi", "EncodecFeatureExtractor"), + ("musicgen", "EncodecFeatureExtractor"), + ("musicgen_melody", "MusicgenMelodyFeatureExtractor"), ("parakeet_ctc", "ParakeetFeatureExtractor"), ("parakeet_encoder", "ParakeetFeatureExtractor"), ("phi4_multimodal", "Phi4MultimodalFeatureExtractor"), ("pop2piano", "Pop2PianoFeatureExtractor"), + ("qwen2_5_omni", "WhisperFeatureExtractor"), + ("qwen2_audio", "WhisperFeatureExtractor"), + ("qwen3_omni_moe", "WhisperFeatureExtractor"), ("seamless_m4t", "SeamlessM4TFeatureExtractor"), ("seamless_m4t_v2", "SeamlessM4TFeatureExtractor"), ("sew", "Wav2Vec2FeatureExtractor"), @@ -66,6 +73,7 @@ ("unispeech", "Wav2Vec2FeatureExtractor"), ("unispeech-sat", "Wav2Vec2FeatureExtractor"), ("univnet", "UnivNetFeatureExtractor"), + ("voxtral", "WhisperFeatureExtractor"), ("wav2vec2", "Wav2Vec2FeatureExtractor"), ("wav2vec2-bert", "Wav2Vec2FeatureExtractor"), ("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 0bd8cc850e2c..8ce1c9a59ae8 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -62,7 +62,9 @@ ("aimv2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("aimv2_vision_model", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("align", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")), + ("altclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("aria", ("AriaImageProcessor", None)), + ("aya_vision", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")), ("beit", ("BeitImageProcessor", "BeitImageProcessorFast")), ("bit", ("BitImageProcessor", "BitImageProcessorFast")), ("blip", ("BlipImageProcessor", "BlipImageProcessorFast")), @@ -73,6 +75,8 @@ ("clip", 
("CLIPImageProcessor", "CLIPImageProcessorFast")), ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")), ("cohere2_vision", (None, "Cohere2VisionImageProcessorFast")), + ("colpali", ("SiglipImageProcessor", "SiglipImageProcessorFast")), + ("colqwen2", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("conditional_detr", ("ConditionalDetrImageProcessor", "ConditionalDetrImageProcessorFast")), ("convnext", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), @@ -95,8 +99,10 @@ ("efficientformer", ("EfficientFormerImageProcessor", None)), ("efficientloftr", ("EfficientLoFTRImageProcessor", "EfficientLoFTRImageProcessorFast")), ("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")), + ("emu3", ("Emu3ImageProcessor", None)), ("eomt", ("EomtImageProcessor", "EomtImageProcessorFast")), ("flava", ("FlavaImageProcessor", "FlavaImageProcessorFast")), + ("florence2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("focalnet", ("BitImageProcessor", "BitImageProcessorFast")), ("fuyu", ("FuyuImageProcessor", "FuyuImageProcessorFast")), ("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")), @@ -114,11 +120,13 @@ ("ijepa", ("ViTImageProcessor", "ViTImageProcessorFast")), ("imagegpt", ("ImageGPTImageProcessor", "ImageGPTImageProcessorFast")), ("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")), + ("internvl", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")), ("janus", ("JanusImageProcessor", "JanusImageProcessorFast")), ("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("kosmos-2.5", ("Kosmos2_5ImageProcessor", "Kosmos2_5ImageProcessorFast")), ("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")), ("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")), + ("layoutxlm", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")), ("levit", ("LevitImageProcessor",
"LevitImageProcessorFast")), ("lfm2_vl", (None, "Lfm2VlImageProcessorFast")), ("lightglue", ("LightGlueImageProcessor", "LightGlueImageProcessorFast")), @@ -141,6 +149,7 @@ ("mobilevitv2", ("MobileViTImageProcessor", "MobileViTImageProcessorFast")), ("nat", ("ViTImageProcessor", "ViTImageProcessorFast")), ("nougat", ("NougatImageProcessor", "NougatImageProcessorFast")), + ("omdet-turbo", ("DetrImageProcessor", "DetrImageProcessorFast")), ("oneformer", ("OneFormerImageProcessor", "OneFormerImageProcessorFast")), ("ovis2", ("Ovis2ImageProcessor", "Ovis2ImageProcessorFast")), ("owlv2", ("Owlv2ImageProcessor", "Owlv2ImageProcessorFast")), @@ -155,14 +164,17 @@ ("prompt_depth_anything", ("PromptDepthAnythingImageProcessor", "PromptDepthAnythingImageProcessorFast")), ("pvt", ("PvtImageProcessor", "PvtImageProcessorFast")), ("pvt_v2", ("PvtImageProcessor", "PvtImageProcessorFast")), + ("qwen2_5_omni", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("qwen2_5_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("qwen2_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), + ("qwen3_omni_moe", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("qwen3_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), ("regnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("resnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("rt_detr", ("RTDetrImageProcessor", "RTDetrImageProcessorFast")), ("sam", ("SamImageProcessor", "SamImageProcessorFast")), ("sam2", (None, "Sam2ImageProcessorFast")), + ("sam2_video", (None, "Sam2ImageProcessorFast")), ("sam_hq", ("SamImageProcessor", "SamImageProcessorFast")), ("segformer", ("SegformerImageProcessor", "SegformerImageProcessorFast")), ("seggpt", ("SegGptImageProcessor", None)), @@ -180,12 +192,14 @@ ("textnet", ("TextNetImageProcessor", "TextNetImageProcessorFast")), ("timesformer", ("VideoMAEImageProcessor", None)), ("timm_wrapper", ("TimmWrapperImageProcessor", None)), + 
("trocr", ("ViTImageProcessor", "ViTImageProcessorFast")), ("tvlt", ("TvltImageProcessor", None)), ("tvp", ("TvpImageProcessor", "TvpImageProcessorFast")), ("udop", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")), ("upernet", ("SegformerImageProcessor", "SegformerImageProcessorFast")), ("van", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), ("video_llama_3", ("VideoLlama3ImageProcessor", "VideoLlama3ImageProcessorFast")), + ("video_llava", ("VideoLlavaImageProcessor", None)), ("videomae", ("VideoMAEImageProcessor", None)), ("vilt", ("ViltImageProcessor", "ViltImageProcessorFast")), ("vipllava", ("CLIPImageProcessor", "CLIPImageProcessorFast")), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 137828d3f6cc..aaea1522a9c0 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -107,6 +107,7 @@ ("mllama", "MllamaProcessor"), ("mm-grounding-dino", "GroundingDinoProcessor"), ("moonshine", "Wav2Vec2Processor"), + ("omdet-turbo", "OmDetTurboProcessor"), ("oneformer", "OneFormerProcessor"), ("ovis2", "Ovis2Processor"), ("owlv2", "Owlv2Processor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 9b85aeb79135..65a3885a1c46 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -72,6 +72,7 @@ ), ), ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("altclip", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)), ("arcee", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("aria", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("aya_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), @@ -156,6 +157,7 @@ ("codegen", 
("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), ("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), ("cohere2", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), + ("cohere2_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), ("colpali", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("colqwen2", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), @@ -224,6 +226,7 @@ ), ), ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)), + ("donut", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)), ( "dpr", ( @@ -238,6 +241,7 @@ ("ernie4_5_moe", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)), ("esm", ("EsmTokenizer", None)), + ("evolla", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ( "exaone4", ( @@ -252,10 +256,13 @@ ("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None), ), ("flaubert", ("FlaubertTokenizer", None)), + ("flava", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("flex_olmo", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("florence2", ("BartTokenizer", "BartTokenizerFast" if is_tokenizers_available() else None)), ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), ("fsmt", ("FSMTTokenizer", None)), ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)), + ("fuyu", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ( "gemma", ( @@ -304,6 +311,7 @@ ("glm4_moe", (None, "PreTrainedTokenizerFast" if 
is_tokenizers_available() else None)), ("glm4v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("glm4v_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("got_ocr2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)), ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), @@ -314,6 +322,7 @@ ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)), ("granite", ("GPT2Tokenizer", None)), + ("granite_speech", ("GPT2Tokenizer", None)), ("granitemoe", ("GPT2Tokenizer", None)), ("granitemoehybrid", ("GPT2Tokenizer", None)), ("granitemoeshared", ("GPT2Tokenizer", None)), @@ -353,11 +362,14 @@ ), ), ("kosmos-2.5", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("kyutai_speech_to_text", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)), ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)), ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)), ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)), ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)), + ("lfm2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("lfm2_vl", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)), ( "llama", @@ -398,6 +410,7 @@ ("mamba", 
(None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("mamba2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), + ("markuplm", ("MarkupLMTokenizer", "MarkupLMTokenizerFast" if is_tokenizers_available() else None)), ( "mbart", ( @@ -484,6 +497,7 @@ "NllbTokenizerFast" if is_tokenizers_available() else None, ), ), + ("nougat", (None, "NougatTokenizerFast" if is_tokenizers_available() else None)), ( "nystromformer", ( @@ -505,6 +519,7 @@ ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None), ), ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("ovis2", (None, "Qwen2TokenizerFast" if is_tokenizers_available() else None)), ("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ("paligemma", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), @@ -530,6 +545,7 @@ None, ), ), + ("perception_lm", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ( "persimmon", ( @@ -539,6 +555,7 @@ ), ("phi", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("phi4_multimodal", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("phimoe", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("phobert", ("PhobertTokenizer", None)), ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), @@ -552,6 +569,7 @@ ), ), ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)), + ("pop2piano", ("Pop2PianoTokenizer", None)), ("prophetnet", ("ProphetNetTokenizer", None)), ("qdqbert", ("BertTokenizer", 
"BertTokenizerFast" if is_tokenizers_available() else None)), ( @@ -658,6 +676,7 @@ ), ), ("smollm3", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("smolvlm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), ("speech_to_text_2", ("Speech2Text2Tokenizer", None)), ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)), @@ -692,6 +711,7 @@ ("tapas", ("TapasTokenizer", None)), ("tapex", ("TapexTokenizer", None)), ("transfo-xl", ("TransfoXLTokenizer", None)), + ("trocr", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)), ("tvp", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ( "udop", @@ -707,9 +727,14 @@ "T5TokenizerFast" if is_tokenizers_available() else None, ), ), + ("video_llama_3", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), ("video_llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("vipllava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ( + "vision_text_dual_encoder", + ("PreTrainedTokenizer", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ), ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("vits", ("VitsTokenizer", None)), ( @@ -725,6 +750,7 @@ ("wav2vec2-bert", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)), + ("wav2vec2_with_lm", ("Wav2Vec2CTCTokenizer", None)), ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)), ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else 
None)), ( @@ -1160,7 +1186,7 @@ def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None, The configuration corresponding to the model to register. slow_tokenizer_class ([`PretrainedTokenizer`], *optional*): The slow tokenizer to register. - fast_tokenizer_class ([`PretrainedTokenizerFast`], *optional*): + fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*): The fast tokenizer to register. """ if slow_tokenizer_class is None and fast_tokenizer_class is None: diff --git a/src/transformers/models/auto/video_processing_auto.py b/src/transformers/models/auto/video_processing_auto.py index bcac454b2d65..0b1d7e24d0f9 100644 --- a/src/transformers/models/auto/video_processing_auto.py +++ b/src/transformers/models/auto/video_processing_auto.py @@ -60,6 +60,7 @@ ("qwen3_vl_moe", "Qwen3VLVideoProcessor"), ("sam2_video", "Sam2VideoVideoProcessor"), ("smolvlm", "SmolVLMVideoProcessor"), + ("video_llama_3", "VideoLlama3VideoProcessor"), ("video_llava", "VideoLlavaVideoProcessor"), ("videomae", "VideoMAEVideoProcessor"), ("vjepa2", "VJEPA2VideoProcessor"), diff --git a/src/transformers/models/aya_vision/processing_aya_vision.py b/src/transformers/models/aya_vision/processing_aya_vision.py index 882a85d40946..049b0e5d24eb 100644 --- a/src/transformers/models/aya_vision/processing_aya_vision.py +++ b/src/transformers/models/aya_vision/processing_aya_vision.py @@ -70,10 +70,6 @@ class AyaVisionProcessor(ProcessorMixin): in a chat into a tokenizable string. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor=None, diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index b14924f1eeeb..403d107f48f9 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -49,9 +49,6 @@ class BarkProcessor(ProcessorMixin): """ - tokenizer_class = "AutoTokenizer" - attributes = ["tokenizer"] - preset_shape = { "semantic_prompt": 1, # 1D array of shape (X,) "coarse_prompt": 2, # 2D array of shape (2,X) diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index f600e8ce27d8..965164206c5a 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -53,10 +53,6 @@ class BlipProcessor(ProcessorMixin): An instance of ['BertTokenizerFast`]. The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") - def __init__(self, image_processor, tokenizer, **kwargs): tokenizer.return_token_type_ids = False super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 40729f4f4501..5949e2c648ce 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -60,10 +60,6 @@ class Blip2Processor(ProcessorMixin): Number of tokens used by the Qformer as queries, should be same as in model's config. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): tokenizer.return_token_type_ids = False if not hasattr(tokenizer, "image_token"): diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 030c578c49cd..5de97ec411dc 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -54,9 +54,6 @@ class BridgeTowerProcessor(ProcessorMixin): An instance of ['RobertaTokenizerFast`]. The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "BridgeTowerImageProcessor" - tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") valid_processor_kwargs = BridgeTowerProcessorKwargs def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/models/bros/processing_bros.py b/src/transformers/models/bros/processing_bros.py index 8de0a1c49b0d..d92b163955a7 100644 --- a/src/transformers/models/bros/processing_bros.py +++ b/src/transformers/models/bros/processing_bros.py @@ -46,8 +46,6 @@ class BrosProcessor(ProcessorMixin): An instance of ['BertTokenizerFast`]. The tokenizer is a required input. 
""" - attributes = ["tokenizer"] - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") valid_processor_kwargs = BrosProcessorKwargs def __init__(self, tokenizer=None, **kwargs): diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 247f72322a2d..694be7ab8f26 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -69,10 +69,6 @@ class ChameleonProcessor(ProcessorMixin): The special token used to indicate image in the text. """ - attributes = ["image_processor", "tokenizer"] - tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") - image_processor_class = "ChameleonImageProcessor" - def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): self.image_seq_length = image_seq_length self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token diff --git a/src/transformers/models/chinese_clip/processing_chinese_clip.py b/src/transformers/models/chinese_clip/processing_chinese_clip.py index 0510b9b0f3c9..6508136f772e 100644 --- a/src/transformers/models/chinese_clip/processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py @@ -34,10 +34,6 @@ class ChineseCLIPProcessor(ProcessorMixin): The tokenizer is a required input. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("ChineseCLIPImageProcessor", "ChineseCLIPImageProcessorFast") - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index 6524a8715841..a72151cb9b63 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -42,9 +42,6 @@ class ClapProcessor(ProcessorMixin): The tokenizer is a required input. """ - feature_extractor_class = "ClapFeatureExtractor" - tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") - def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py index 7b856f9981ee..9258d2e8fee3 100644 --- a/src/transformers/models/clip/processing_clip.py +++ b/src/transformers/models/clip/processing_clip.py @@ -33,10 +33,6 @@ class CLIPProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast") - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index 39e091106c71..4d431181cb4f 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -34,10 +34,6 @@ class CLIPSegProcessor(ProcessorMixin): The tokenizer is a required input. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("ViTImageProcessor", "ViTImageProcessorFast") - tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clvp/processing_clvp.py b/src/transformers/models/clvp/processing_clvp.py index 8fad43cd2f30..331589a23999 100644 --- a/src/transformers/models/clvp/processing_clvp.py +++ b/src/transformers/models/clvp/processing_clvp.py @@ -38,9 +38,6 @@ class ClvpProcessor(ProcessorMixin): An instance of [`ClvpTokenizer`]. The tokenizer is a required input. """ - feature_extractor_class = "ClvpFeatureExtractor" - tokenizer_class = "ClvpTokenizer" - def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) diff --git a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py index d4fcec4da875..b34fd1c5594e 100644 --- a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py @@ -47,10 +47,6 @@ class Cohere2VisionProcessor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor=None, diff --git a/src/transformers/models/colpali/processing_colpali.py b/src/transformers/models/colpali/processing_colpali.py index cd33607a35fd..1ad511ced7a7 100644 --- a/src/transformers/models/colpali/processing_colpali.py +++ b/src/transformers/models/colpali/processing_colpali.py @@ -93,10 +93,6 @@ class ColPaliProcessor(ProcessorMixin): A prefix to be used for the query. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = ("SiglipImageProcessor", "SiglipImageProcessorFast") - tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast") - def __init__( self, image_processor=None, diff --git a/src/transformers/models/colqwen2/modular_colqwen2.py b/src/transformers/models/colqwen2/modular_colqwen2.py index 072591abbab8..a96ecc6c7416 100644 --- a/src/transformers/models/colqwen2/modular_colqwen2.py +++ b/src/transformers/models/colqwen2/modular_colqwen2.py @@ -65,9 +65,6 @@ class ColQwen2Processor(ColPaliProcessor): query_prefix (`str`, *optional*): A prefix to be used for the query. """ - image_processor_class = "AutoImageProcessor" - tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") - def __init__( self, image_processor=None, diff --git a/src/transformers/models/colqwen2/processing_colqwen2.py b/src/transformers/models/colqwen2/processing_colqwen2.py index 2eb9fed873a8..00f00c920856 100644 --- a/src/transformers/models/colqwen2/processing_colqwen2.py +++ b/src/transformers/models/colqwen2/processing_colqwen2.py @@ -64,11 +64,6 @@ class ColQwen2Processor(ProcessorMixin): query_prefix (`str`, *optional*): A prefix to be used for the query. 
""" - attributes = ["image_processor", "tokenizer"] - - image_processor_class = "AutoImageProcessor" - tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") - def __init__( self, image_processor=None, diff --git a/src/transformers/models/csm/processing_csm.py b/src/transformers/models/csm/processing_csm.py index 172016f6431d..d77ffeffd896 100644 --- a/src/transformers/models/csm/processing_csm.py +++ b/src/transformers/models/csm/processing_csm.py @@ -95,10 +95,6 @@ class CsmProcessor(ProcessorMixin): """ - attributes = ["feature_extractor", "tokenizer"] - feature_extractor_class = "EncodecFeatureExtractor" - tokenizer_class = "PreTrainedTokenizerFast" - def __init__( self, feature_extractor, diff --git a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py index 9b894b7f7505..21cb19d79c3b 100644 --- a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py @@ -222,11 +222,6 @@ class DeepseekVLProcessor(ProcessorMixin): The number of special image tokens used as placeholders for visual content in text sequences. """ - attributes = ["image_processor", "tokenizer"] - valid_kwargs = ["chat_template", "num_image_tokens"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor, diff --git a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py index ddeb4f799ee1..22b1c2ab71dd 100644 --- a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py @@ -52,11 +52,6 @@ class DeepseekVLProcessor(ProcessorMixin): The number of special image tokens used as placeholders for visual content in text sequences. 
""" - attributes = ["image_processor", "tokenizer"] - valid_kwargs = ["chat_template", "num_image_tokens"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor, diff --git a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py index d20fa495f9b8..8f842db7346f 100644 --- a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py @@ -52,11 +52,6 @@ class DeepseekVLHybridProcessor(ProcessorMixin): The number of special image tokens used as placeholders for visual content in text sequences. """ - attributes = ["image_processor", "tokenizer"] - valid_kwargs = ["chat_template", "num_image_tokens"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor, diff --git a/src/transformers/models/dia/processing_dia.py b/src/transformers/models/dia/processing_dia.py index 6518b5444639..23c04687308c 100644 --- a/src/transformers/models/dia/processing_dia.py +++ b/src/transformers/models/dia/processing_dia.py @@ -77,8 +77,6 @@ class DiaProcessor(ProcessorMixin): An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input. """ - feature_extractor_class = "DiaFeatureExtractor" - tokenizer_class = "DiaTokenizer" audio_tokenizer_class = "DacModel" def __init__(self, feature_extractor, tokenizer, audio_tokenizer): diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index 65ca58bcf781..fedd173117eb 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -48,10 +48,6 @@ class DonutProcessor(ProcessorMixin): An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. 
The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/emu3/processing_emu3.py b/src/transformers/models/emu3/processing_emu3.py index b7ed8e9074f0..52f39a913c54 100644 --- a/src/transformers/models/emu3/processing_emu3.py +++ b/src/transformers/models/emu3/processing_emu3.py @@ -64,10 +64,6 @@ class Emu3Processor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ["image_processor", "tokenizer"] - tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast") - image_processor_class = "Emu3ImageProcessor" - def __init__( self, image_processor, diff --git a/src/transformers/models/evolla/processing_evolla.py b/src/transformers/models/evolla/processing_evolla.py index 3be0e07364a6..807bd294c406 100644 --- a/src/transformers/models/evolla/processing_evolla.py +++ b/src/transformers/models/evolla/processing_evolla.py @@ -16,14 +16,12 @@ Processor class for EVOLLA. """ -import os from typing import Optional, Union from ...feature_extraction_utils import BatchFeature from ...processing_utils import ( ProcessorMixin, ) -from ..auto import AutoTokenizer PROTEIN_VALID_KEYS = ["aa_seq", "foldseek", "msa"] @@ -47,15 +45,6 @@ class EvollaProcessor(ProcessorMixin): The maximum length of the text to be generated. 
""" - attributes = ["protein_tokenizer", "tokenizer"] - valid_kwargs = ["sequence_max_length"] - # protein_tokenizer_class = "EsmTokenizer" - # tokenizer_class = "LlamaTokenizerFast" - protein_tokenizer_class = "AutoTokenizer" - tokenizer_class = "AutoTokenizer" - protein_tokenizer_dir_name = "protein_tokenizer" - # tokenizer_dir_name = "text_tokenizer" - def __init__(self, protein_tokenizer, tokenizer=None, protein_max_length=1024, text_max_length=512, **kwargs): if protein_tokenizer is None: raise ValueError("You need to specify an `protein_tokenizer`.") @@ -206,42 +195,5 @@ def protein_batch_decode(self, *args, **kwargs): def protein_decode(self, *args, **kwargs): return self.protein_tokenizer.decode(*args, **kwargs) - # overwrite to save the protein tokenizer in a separate folder - # Adapted from instructblip.processing_instructblip.py (https://github.com/huggingface/transformers/blob/9b479a245b793cac2a8b2e87c6d8e81bb24e20c4/src/transformers/models/instructblip/processing_instructblip.py#L191-L221) - def save_pretrained(self, save_directory, **kwargs): - # only save the protein tokenizer in sub_dir - self.protein_tokenizer.save_pretrained(os.path.join(save_directory, self.protein_tokenizer_dir_name)) - - # we modify the attributes so that only the text tokenizer are saved in the main folder - protein_tokenizer_present = "protein_tokenizer" in self.attributes - # find the correct position of it in the attributes list - protein_tokenizer_index = self.attributes.index("protein_tokenizer") if protein_tokenizer_present else None - if protein_tokenizer_present and protein_tokenizer_index is not None: - self.attributes.remove("protein_tokenizer") - - outputs = super().save_pretrained(save_directory, **kwargs) - - if protein_tokenizer_present and protein_tokenizer_index is not None: - self.attributes.insert(protein_tokenizer_index, "protein_tokenizer") - - return outputs - - # overwrite to load the protein tokenizer from a separate folder - # Adapted from 
instructblip.processing_instructblip.py (https://github.com/huggingface/transformers/blob/9b479a245b793cac2a8b2e87c6d8e81bb24e20c4/src/transformers/models/instructblip/processing_instructblip.py#L191-L221) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs) - - # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs' - if isinstance(processor, tuple): - processor = processor[0] - protein_tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path, subfolder=cls.protein_tokenizer_dir_name - ) - - processor.protein_tokenizer = protein_tokenizer - - return processor - __all__ = ["EvollaProcessor"] diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index 272fb01d7b7a..7e5b3c0e012e 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -31,10 +31,6 @@ class FlavaProcessor(ProcessorMixin): tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "FlavaImageProcessor" - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/florence2/modular_florence2.py b/src/transformers/models/florence2/modular_florence2.py index 6ae43c0b69a7..049beffce14b 100644 --- a/src/transformers/models/florence2/modular_florence2.py +++ b/src/transformers/models/florence2/modular_florence2.py @@ -256,10 +256,6 @@ class Florence2Processor(ProcessorMixin): thresholds, or banned tokens. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = ("BartTokenizer", "BartTokenizerFast") - def __init__( self, image_processor=None, diff --git a/src/transformers/models/florence2/processing_florence2.py b/src/transformers/models/florence2/processing_florence2.py index 1c25ddceeafc..c8d699e4bc3e 100644 --- a/src/transformers/models/florence2/processing_florence2.py +++ b/src/transformers/models/florence2/processing_florence2.py @@ -62,10 +62,6 @@ class Florence2Processor(ProcessorMixin): thresholds, or banned tokens. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = ("BartTokenizer", "BartTokenizerFast") - def __init__( self, image_processor=None, diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index a715ce412313..ee697deccf9e 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -347,10 +347,6 @@ class FuyuProcessor(ProcessorMixin): The tokenizer is a required input. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "FuyuImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer, **kwargs): super().__init__(image_processor=image_processor, tokenizer=tokenizer) self.image_processor = image_processor diff --git a/src/transformers/models/gemma3/processing_gemma3.py b/src/transformers/models/gemma3/processing_gemma3.py index a9bac5b69e47..11574e30b7c1 100644 --- a/src/transformers/models/gemma3/processing_gemma3.py +++ b/src/transformers/models/gemma3/processing_gemma3.py @@ -42,10 +42,6 @@ class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): class Gemma3Processor(ProcessorMixin): - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor, diff --git a/src/transformers/models/gemma3n/processing_gemma3n.py b/src/transformers/models/gemma3n/processing_gemma3n.py index 913336b8d3f5..51b686557ed0 100644 --- a/src/transformers/models/gemma3n/processing_gemma3n.py +++ b/src/transformers/models/gemma3n/processing_gemma3n.py @@ -51,11 +51,6 @@ class Gemma3nProcessor(ProcessorMixin): The number of image soft tokens that should be added to """ - attributes = ["feature_extractor", "image_processor", "tokenizer"] - feature_extractor_class = "AutoFeatureExtractor" - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, feature_extractor, diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 2eba7c68f584..89cfc9618987 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -33,10 +33,6 @@ class GitProcessor(ProcessorMixin): The tokenizer is a required input. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 8ae513b63d44..9ff12163b12a 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -1545,8 +1545,6 @@ class Glm4vProcessor(Qwen2VLProcessor): in a chat into a tokenizable string. """ - tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template) self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index e8f9c948c66d..79935cbde7b4 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -59,12 +59,6 @@ class Glm4vProcessor(ProcessorMixin): in a chat into a tokenizable string. 
""" - attributes = ["image_processor", "tokenizer", "video_processor"] - image_processor_class = "AutoImageProcessor" - video_processor_class = "AutoVideoProcessor" - - tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs): self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token diff --git a/src/transformers/models/got_ocr2/processing_got_ocr2.py b/src/transformers/models/got_ocr2/processing_got_ocr2.py index 1843b7f28830..162efef5e9f9 100644 --- a/src/transformers/models/got_ocr2/processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/processing_got_ocr2.py @@ -93,10 +93,6 @@ class GotOcr2Processor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "PreTrainedTokenizerFast" - def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 0b76ccfe75db..910840bd661c 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -30,10 +30,6 @@ class GraniteSpeechProcessor(ProcessorMixin): - attributes = ["audio_processor", "tokenizer"] - audio_processor_class = "GraniteSpeechFeatureExtractor" - tokenizer_class = "AutoTokenizer" - def __init__( self, audio_processor, diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 
5f2f900451b2..74565588d852 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -130,9 +130,6 @@ class GroundingDinoProcessor(ProcessorMixin): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "GroundingDinoImageProcessor" - tokenizer_class = "AutoTokenizer" valid_processor_kwargs = GroundingDinoProcessorKwargs def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index b0ad20df386b..7cb640e56854 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -153,10 +153,6 @@ class IdeficsProcessor(ProcessorMixin): The string representation of token representing end of utterance """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "IdeficsImageProcessor" - tokenizer_class = "LlamaTokenizerFast" - def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs): super().__init__(image_processor, tokenizer) self.image_token_id = ( diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index c419a3641254..df5f9ca73a8b 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -75,10 +75,6 @@ class Idefics2Processor(ProcessorMixin): in a chat into a tokenizable string. 
""" - attributes = ["image_processor", "tokenizer"] - image_processor_class = "Idefics2ImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: Optional[str] = None, **kwargs ): diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 143805ef6f43..5c978eb3b230 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -121,10 +121,6 @@ class Idefics3Processor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "Idefics3ImageProcessor" - tokenizer_class = "AutoTokenizer" - def __init__( self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs ): diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index afe43c1fc7a7..cfed52f745ae 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -16,7 +16,6 @@ Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former. """ -import os from typing import Optional, Union from ...image_processing_utils import BatchFeature @@ -24,7 +23,6 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput from ...utils import logging -from ..auto import AutoTokenizer logger = logging.get_logger(__name__) @@ -65,11 +63,6 @@ class InstructBlipProcessor(ProcessorMixin): Number of tokens used by the Qformer as queries, should be same as in model's config. 
""" - attributes = ["image_processor", "tokenizer", "qformer_tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") - tokenizer_class = "AutoTokenizer" - qformer_tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): if not hasattr(tokenizer, "image_token"): self.image_token = AddedToken("", normalized=False, special=True) @@ -152,36 +145,5 @@ def model_input_names(self): qformer_input_names = ["qformer_input_ids", "qformer_attention_mask"] return tokenizer_input_names + image_processor_input_names + qformer_input_names - # overwrite to save the Q-Former tokenizer in a separate folder - def save_pretrained(self, save_directory, **kwargs): - if os.path.isfile(save_directory): - raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file") - os.makedirs(save_directory, exist_ok=True) - qformer_tokenizer_path = os.path.join(save_directory, "qformer_tokenizer") - self.qformer_tokenizer.save_pretrained(qformer_tokenizer_path) - - # We modify the attributes so that only the tokenizer and image processor are saved in the main folder - qformer_present = "qformer_tokenizer" in self.attributes - if qformer_present: - self.attributes.remove("qformer_tokenizer") - - outputs = super().save_pretrained(save_directory, **kwargs) - - if qformer_present: - self.attributes += ["qformer_tokenizer"] - return outputs - - # overwrite to load the Q-Former tokenizer from a separate folder - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs) - - # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs' - if isinstance(processor, tuple): - processor = processor[0] - qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer") - processor.qformer_tokenizer = 
qformer_tokenizer - return processor - __all__ = ["InstructBlipProcessor"] diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index f813faba7b89..81d0103b2742 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -16,7 +16,6 @@ Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former. """ -import os from typing import Optional, Union from ...image_processing_utils import BatchFeature @@ -30,7 +29,6 @@ ) from ...utils import TensorType, logging from ...video_utils import VideoInput -from ..auto import AutoTokenizer logger = logging.get_logger(__name__) @@ -55,11 +53,6 @@ class InstructBlipVideoProcessor(ProcessorMixin): Number of tokens used by the Qformer as queries, should be same as in model's config. """ - attributes = ["video_processor", "tokenizer", "qformer_tokenizer"] - video_processor_class = "AutoVideoProcessor" - tokenizer_class = "AutoTokenizer" - qformer_tokenizer_class = "AutoTokenizer" - def __init__(self, video_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): if not hasattr(tokenizer, "video_token"): self.video_token = AddedToken("