diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 6b6ff939a50c..c39c93602892 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -185,6 +185,12 @@
         ("megatron-bert", "BertTokenizer" if is_tokenizers_available() else None),
         ("metaclip_2", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
         ("mgp-str", "MgpstrTokenizer"),
+        (
+            "ministral",
+            "MistralCommonBackend"
+            if is_mistral_common_available()
+            else ("TokenizersBackend" if is_tokenizers_available() else None),
+        ),
         (
             "ministral3",
             "MistralCommonBackend"
@@ -331,6 +337,27 @@
     ]
 )
 
+# Models with incorrect tokenizer_class in their Hub tokenizer_config.json files.
+# For these model types, fall back to TokenizersBackend when no explicit
+# mapping entry exists (already-registered entries are left untouched).
+MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS: set[str] = {
+    "arctic",
+    "deepseek_vl",
+    "deepseek_vl_hybrid",
+    "hyperclovax_vlm",
+    "jamba",
+    "janus",
+    "llava",
+    "llava_next",
+    "opencua",
+    "phi3",
+    "step3p5",
+    "vipllava",
+}
+
+for model_type in MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS:
+    if model_type not in TOKENIZER_MAPPING_NAMES:
+        TOKENIZER_MAPPING_NAMES[model_type] = "TokenizersBackend" if is_tokenizers_available() else None
+
 TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)
 
 CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}