Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions src/transformers/models/auto/tokenization_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,12 @@
("megatron-bert", "BertTokenizer" if is_tokenizers_available() else None),
("metaclip_2", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
("mgp-str", "MgpstrTokenizer"),
(
"ministral",
"MistralCommonBackend"
if is_mistral_common_available()
else ("TokenizersBackend" if is_tokenizers_available() else None),
),
(
"ministral3",
"MistralCommonBackend"
Expand Down Expand Up @@ -331,6 +337,27 @@
]
)

# Models with incorrect tokenizer_class in their Hub tokenizer_config.json files.
# These models will be forced to use TokenizersBackend.
# NOTE: the original span had pull-request review chrome pasted into the middle
# of this literal, which made it invalid Python; the set is rebuilt here clean
# and sorted alphabetically for easier maintenance.
MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS: set[str] = {
    "arctic",
    "deepseek_vl",
    "deepseek_vl_hybrid",
    "hyperclovax_vlm",
    "jamba",
    "janus",
    "llava",
    "llava_next",
    "opencua",
    "phi3",
    "step3p5",
    "vipllava",
}

# For the models flagged above, register a TokenizersBackend entry (or None
# when the tokenizers library is unavailable) unless the model type already
# has an explicit mapping — setdefault preserves any existing entry.
for _model_type in MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS:
    TOKENIZER_MAPPING_NAMES.setdefault(
        _model_type, "TokenizersBackend" if is_tokenizers_available() else None
    )

# Lazy mapping tying config names to tokenizer class names; presumably the
# concrete classes are resolved on first access by _LazyAutoMapping — its
# definition is outside this view, so confirm against the helper.
TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)

# Inverse of CONFIG_MAPPING_NAMES: maps each config name back to its model-type key.
CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}
Expand Down