huggingface · ArthurZucker · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
@@ -32,7 +32,6 @@
 from .generation.configuration_utils import GenerationConfig
 from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
 from .modeling_rope_utils import RotaryEmbeddingConfigMixin
-from .tokenization_utils_base import PreTrainedTokenizerBase
 from .utils import (
     CONFIG_NAME,
     PushToHubMixin,
@@ -234,9 +233,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
     label2id: dict[str, int] | dict[str, str] | None = None
     problem_type: Literal["regression", "single_label_classification", "multi_label_classification"] | None = None
 
-    # Tokenizer kwargs
-    tokenizer_class: str | PreTrainedTokenizerBase | None = None
-
     def __post_init__(self, **kwargs):
         # BC for the `torch_dtype` argument instead of the simpler `dtype`
         # Do not warn, as it would otherwise always be triggered since most configs on the hub have `torch_dtype`

diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py
@@ -29,8 +29,6 @@ class MT5Config(PreTrainedConfig):
         The maximum distance of the longer sequences for the bucket separation.
     feed_forward_proj (`str`, *optional*, defaults to `"gated-gelu"`):
         Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
-    tokenizer_class (`str`, *optional*, defaults to `"T5Tokenizer"`):
-        The tokenizer's class name.
     """
 
     model_type = "mt5"
@@ -57,7 +55,6 @@ class MT5Config(PreTrainedConfig):
     feed_forward_proj: str = "gated-gelu"
     is_encoder_decoder: bool = True
     use_cache: bool = True
-    tokenizer_class: str = "T5Tokenizer"
     tie_word_embeddings: bool = True
     bos_token_id: int | None = None
     pad_token_id: int | None = 0

diff --git a/src/transformers/models/umt5/configuration_umt5.py b/src/transformers/models/umt5/configuration_umt5.py
@@ -29,8 +29,6 @@ class UMT5Config(PreTrainedConfig):
         The maximum distance of the longer sequences for the bucket separation.
     feed_forward_proj (`str`, *optional*, defaults to `"gated-gelu"`):
         Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
-    tokenizer_class (`str`, *optional*, defaults to `"T5Tokenizer"`):
-        The tokenizer's class name
     """
 
     model_type = "umt5"
@@ -57,7 +55,6 @@ class UMT5Config(PreTrainedConfig):
     feed_forward_proj: str = "gated-gelu"
     is_encoder_decoder: bool = True
     use_cache: bool = True
-    tokenizer_class: str = "T5Tokenizer"
     pad_token_id: int | None = 0
     eos_token_id: int | list[int] | None = 1
     decoder_start_token_id: int | None = 0