From 666564599baaf0cf809e89412298ecff91f99c71 Mon Sep 17 00:00:00 2001
From: raushan
Date: Thu, 19 Mar 2026 10:51:21 +0100
Subject: [PATCH 1/2] update eos and q-lora-rank

---
 src/transformers/configuration_utils.py | 18 +++++++++++++++++-
 .../blenderbot/configuration_blenderbot.py | 2 +-
 .../configuration_blenderbot_small.py | 2 +-
 .../models/clvp/configuration_clvp.py | 2 +-
 .../deepseek_v3/configuration_deepseek_v3.py | 2 +-
 .../models/dia/configuration_dia.py | 2 +-
 .../models/eurobert/configuration_eurobert.py | 2 +-
 .../models/eurobert/modular_eurobert.py | 2 +-
 .../exaone_moe/configuration_exaone_moe.py | 2 +-
 .../models/exaone_moe/modular_exaone_moe.py | 2 +-
 .../models/flaubert/configuration_flaubert.py | 2 +-
 .../models/fnet/configuration_fnet.py | 2 +-
 .../models/fsmt/configuration_fsmt.py | 4 ++--
 .../models/git/configuration_git.py | 2 +-
 .../configuration_glm4_moe_lite.py | 2 +-
 .../glm4_moe_lite/modular_glm4_moe_lite.py | 2 +-
 .../models/gpt2/configuration_gpt2.py | 2 +-
 .../gpt_bigcode/configuration_gpt_bigcode.py | 2 +-
 .../models/gpt_neo/configuration_gpt_neo.py | 2 +-
 .../models/gptj/configuration_gptj.py | 2 +-
 .../models/groupvit/configuration_groupvit.py | 2 +-
 .../configuration_higgs_audio_v2.py | 2 +-
 .../higgs_audio_v2/modular_higgs_audio_v2.py | 2 +-
 .../models/hubert/configuration_hubert.py | 2 +-
 .../models/ibert/configuration_ibert.py | 2 +-
 .../models/idefics/configuration_idefics.py | 2 +-
 .../models/jamba/configuration_jamba.py | 2 +-
 .../configuration_jina_embeddings_v3.py | 2 +-
 .../models/kosmos2/configuration_kosmos2.py | 2 +-
 .../kosmos2_5/configuration_kosmos2_5.py | 2 +-
 .../layoutlmv3/configuration_layoutlmv3.py | 2 +-
 .../models/led/configuration_led.py | 2 +-
 .../models/llama4/configuration_llama4.py | 2 +-
 .../configuration_longcat_flash.py | 2 +-
 .../models/longt5/configuration_longt5.py | 2 +-
 .../models/luke/configuration_luke.py | 2 +-
 .../models/m2m_100/configuration_m2m_100.py | 2 +-
 .../models/mamba2/configuration_mamba2.py | 2 +-
 .../models/marian/configuration_marian.py | 4 ++--
 .../models/markuplm/configuration_markuplm.py | 2 +-
 .../models/mbart/configuration_mbart.py | 4 ++--
 .../models/mistral4/configuration_mistral4.py | 4 ++--
 .../models/mpnet/configuration_mpnet.py | 2 +-
 .../models/mra/configuration_mra.py | 2 +-
 .../models/mt5/configuration_mt5.py | 2 +-
 .../models/mvp/configuration_mvp.py | 2 +-
 .../nemotron_h/configuration_nemotron_h.py | 2 +-
 .../models/nllb_moe/configuration_nllb_moe.py | 2 +-
 .../configuration_nystromformer.py | 2 +-
 .../olmo_hybrid/configuration_olmo_hybrid.py | 2 +-
 .../models/olmo_hybrid/modular_olmo_hybrid.py | 2 +-
 .../models/opt/configuration_opt.py | 2 +-
 .../models/owlv2/configuration_owlv2.py | 2 +-
 .../models/owlvit/configuration_owlvit.py | 2 +-
 .../models/pegasus/configuration_pegasus.py | 4 ++--
 .../pegasus_x/configuration_pegasus_x.py | 4 ++--
 .../pix2struct/configuration_pix2struct.py | 2 +-
 .../models/plbart/configuration_plbart.py | 4 ++--
 .../pop2piano/configuration_pop2piano.py | 2 +-
 .../models/reformer/configuration_reformer.py | 2 +-
 .../models/rembert/configuration_rembert.py | 2 +-
 .../models/roberta/configuration_roberta.py | 2 +-
 .../configuration_roberta_prelayernorm.py | 2 +-
 .../models/rwkv/configuration_rwkv.py | 2 +-
 .../seamless_m4t/configuration_seamless_m4t.py | 4 ++--
 .../configuration_seamless_m4t_v2.py | 4 ++--
 .../models/sew/configuration_sew.py | 2 +-
 .../models/sew_d/configuration_sew_d.py | 2 +-
 .../configuration_speech_to_text.py | 2 +-
.../models/speecht5/configuration_speecht5.py | 2 +- .../configuration_switch_transformers.py | 2 +- src/transformers/models/t5/configuration_t5.py | 2 +- .../models/trocr/configuration_trocr.py | 2 +- .../models/udop/configuration_udop.py | 2 +- .../models/umt5/configuration_umt5.py | 2 +- .../unispeech/configuration_unispeech.py | 2 +- .../visual_bert/configuration_visual_bert.py | 2 +- .../models/wav2vec2/configuration_wav2vec2.py | 2 +- .../configuration_wav2vec2_bert.py | 2 +- .../configuration_wav2vec2_conformer.py | 2 +- .../models/wavlm/configuration_wavlm.py | 2 +- .../models/whisper/configuration_whisper.py | 2 +- .../models/x_clip/configuration_x_clip.py | 2 +- .../models/xglm/configuration_xglm.py | 2 +- .../models/xlm/configuration_xlm.py | 2 +- .../xlm_roberta/configuration_xlm_roberta.py | 2 +- .../configuration_xlm_roberta_xl.py | 2 +- .../models/xlnet/configuration_xlnet.py | 2 +- .../models/xmod/configuration_xmod.py | 2 +- .../models/yoso/configuration_yoso.py | 2 +- .../models/youtu/configuration_youtu.py | 2 +- .../models/zamba/configuration_zamba.py | 2 +- 92 files changed, 117 insertions(+), 101 deletions(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 408d53e5a24a..d172f76a8f59 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -26,6 +26,7 @@ from huggingface_hub.dataclasses import strict from packaging import version +from .tokenization_utils_base import PreTrainedTokenizer from . import __version__ from .dynamic_module_utils import custom_object_save from .generation.configuration_utils import GenerationConfig @@ -189,7 +190,7 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin): problem_type: Literal["regression", "single_label_classification", "multi_label_classification"] | None = None # Tokenizer kwargs - tokenizer_class: str | None = None + tokenizer_class: str | PreTrainedTokenizer | None = None def __post_init__(self, **kwargs): # BC for the `torch_dtype` argument instead of the simpler `dtype` @@ -1265,3 +1266,18 @@ def recursive_diff_dict(dict_a, dict_b, config_obj=None): # The alias is only here for BC - we did not have the correct CamelCasing before PretrainedConfig = PreTrainedConfig + + +def layer_type_validation(layer_types: list[str], num_hidden_layers: int | None = None, attention: bool = True): + logger.warning( + "`layer_type_validation` is deprecated and will be removed in v5.20. 
" + "Use `PreTrainedConfig.validate_layer_type` instead" + ) + + if not all(layer_type in ALLOWED_LAYER_TYPES for layer_type in layer_types): + raise ValueError(f"The `layer_types` entries must be in {ALLOWED_LAYER_TYPES}") + if num_hidden_layers is not None and num_hidden_layers != len(layer_types): + raise ValueError( + f"`num_hidden_layers` ({num_hidden_layers}) must be equal to the number of layer types " + f"({len(layer_types)})" + ) diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index 2d97047e45f3..9d7cfedfc268 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -73,7 +73,7 @@ class BlenderbotConfig(PreTrainedConfig): bos_token_id: int | None = 1 eos_token_id: int | list[int] | None = 2 encoder_no_repeat_ngram_size: int = 3 - forced_eos_token_id: int | None = 2 + forced_eos_token_id: int | list[int] | None = 2 is_decoder: bool = False tie_word_embeddings: bool = True diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py index b8a0525a8d5d..b0ef05636fe6 100644 --- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -69,7 +69,7 @@ class BlenderbotSmallConfig(PreTrainedConfig): pad_token_id: int | None = 0 bos_token_id: int | None = 1 eos_token_id: int | list[int] | None = 2 - forced_eos_token_id: int | None = 2 + forced_eos_token_id: int | list[int] | None = 2 is_decoder: bool = False tie_word_embeddings: bool = True diff --git a/src/transformers/models/clvp/configuration_clvp.py b/src/transformers/models/clvp/configuration_clvp.py index 16beba7a29d2..213ff04a8e02 100644 --- a/src/transformers/models/clvp/configuration_clvp.py +++ b/src/transformers/models/clvp/configuration_clvp.py @@ -174,7 +174,7 @@ class ClvpDecoderConfig(PreTrainedConfig): summary_first_dropout: float | int = 0.1 use_cache: bool = True bos_token_id: int | None = 8192 - eos_token_id: int | None = 8193 + eos_token_id: int | list[int] | None = 8193 pad_token_id: int | None = None feature_size: int = 80 use_attention_bias: bool = True diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index 5504bf9ed76a..6f9eb334495c 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -78,7 +78,7 @@ class DeepseekV3Config(PreTrainedConfig): n_routed_experts: int = 256 routed_scaling_factor: float = 2.5 kv_lora_rank: int = 512 - q_lora_rank: int = 1536 + q_lora_rank: int | None = 1536 qk_rope_head_dim: int = 64 v_head_dim: int | None = 128 qk_nope_head_dim: int = 128 diff --git a/src/transformers/models/dia/configuration_dia.py b/src/transformers/models/dia/configuration_dia.py index 3bfdc0efc3e3..fa2b401b837a 100644 --- a/src/transformers/models/dia/configuration_dia.py +++ b/src/transformers/models/dia/configuration_dia.py @@ -78,7 +78,7 @@ class DiaDecoderConfig(PreTrainedConfig): use_cache: bool = True is_encoder_decoder: bool = True pad_token_id: int | None = 1025 - eos_token_id: int | None = 1024 + eos_token_id: int | list[int] | None = 1024 bos_token_id: int | None = 1026 diff --git 
a/src/transformers/models/eurobert/configuration_eurobert.py b/src/transformers/models/eurobert/configuration_eurobert.py index f85dccfd25c6..8781bb2e1468 100644 --- a/src/transformers/models/eurobert/configuration_eurobert.py +++ b/src/transformers/models/eurobert/configuration_eurobert.py @@ -61,7 +61,7 @@ class EuroBertConfig(LlamaConfig): initializer_range: float = 0.02 rms_norm_eps: float = 1e-05 bos_token_id: int | None = 128000 - eos_token_id: int | None = 128001 + eos_token_id: int | list[int] | None = 128001 pad_token_id: int | None = 128001 mask_token_id: int = 128002 pretraining_tp: int = 1 diff --git a/src/transformers/models/eurobert/modular_eurobert.py b/src/transformers/models/eurobert/modular_eurobert.py index 26105898684c..a02b511a1e24 100644 --- a/src/transformers/models/eurobert/modular_eurobert.py +++ b/src/transformers/models/eurobert/modular_eurobert.py @@ -63,7 +63,7 @@ class EuroBertConfig(LlamaConfig): initializer_range: float = 0.02 rms_norm_eps: float = 1e-05 bos_token_id: int | None = 128000 - eos_token_id: int | None = 128001 + eos_token_id: int | list[int] | None = 128001 pad_token_id: int | None = 128001 mask_token_id: int = 128002 pretraining_tp: int = 1 diff --git a/src/transformers/models/exaone_moe/configuration_exaone_moe.py b/src/transformers/models/exaone_moe/configuration_exaone_moe.py index 788c58044652..eac3cb0e9464 100644 --- a/src/transformers/models/exaone_moe/configuration_exaone_moe.py +++ b/src/transformers/models/exaone_moe/configuration_exaone_moe.py @@ -94,7 +94,7 @@ class ExaoneMoeConfig(PreTrainedConfig): rms_norm_eps: float = 1e-5 use_cache: bool = True bos_token_id: int | None = 1 - eos_token_id: int | None = 53 + eos_token_id: int | list[int] | None = 53 pad_token_id: int | None = 0 tie_word_embeddings: bool = False rope_parameters: dict | None = None diff --git a/src/transformers/models/exaone_moe/modular_exaone_moe.py b/src/transformers/models/exaone_moe/modular_exaone_moe.py index 3b70ccca4f70..1bf6a61c7ab5 100644 --- a/src/transformers/models/exaone_moe/modular_exaone_moe.py +++ b/src/transformers/models/exaone_moe/modular_exaone_moe.py @@ -92,7 +92,7 @@ class ExaoneMoeConfig(Exaone4Config): rms_norm_eps: float = 1e-5 use_cache: bool = True bos_token_id: int | None = 1 - eos_token_id: int | None = 53 + eos_token_id: int | list[int] | None = 53 pad_token_id: int | None = 0 tie_word_embeddings: bool = False rope_parameters: dict | None = None diff --git a/src/transformers/models/flaubert/configuration_flaubert.py b/src/transformers/models/flaubert/configuration_flaubert.py index 87085038903d..f6ffc43a830d 100644 --- a/src/transformers/models/flaubert/configuration_flaubert.py +++ b/src/transformers/models/flaubert/configuration_flaubert.py @@ -136,7 +136,7 @@ class FlaubertConfig(PreTrainedConfig): lang_id: int = 0 pad_token_id: int | None = 2 bos_token_id: int | None = 0 - eos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 1 tie_word_embeddings: bool = True diff --git a/src/transformers/models/fnet/configuration_fnet.py b/src/transformers/models/fnet/configuration_fnet.py index 90586bc6f91c..3bf7f0c0e0ad 100644 --- a/src/transformers/models/fnet/configuration_fnet.py +++ b/src/transformers/models/fnet/configuration_fnet.py @@ -62,7 +62,7 @@ class FNetConfig(PreTrainedConfig): tpu_short_seq_length: int = 512 pad_token_id: int | None = 3 bos_token_id: int | None = 1 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 tie_word_embeddings: bool = True diff --git 
a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py index e889e65a5219..5ece7129a8f9 100644 --- a/src/transformers/models/fsmt/configuration_fsmt.py +++ b/src/transformers/models/fsmt/configuration_fsmt.py @@ -97,8 +97,8 @@ class FSMTConfig(PreTrainedConfig): use_cache: bool = True pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 - forced_eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 + forced_eos_token_id: int | list[int] | None = 2 def __post_init__(self, **kwargs): kwargs.pop("decoder", None) # delete unused kwargs diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py index 804642bbe052..5ab6513c2f13 100644 --- a/src/transformers/models/git/configuration_git.py +++ b/src/transformers/models/git/configuration_git.py @@ -98,7 +98,7 @@ class GitConfig(PreTrainedConfig): use_cache: bool = True tie_word_embeddings: bool = False bos_token_id: int | None = 101 - eos_token_id: int | None = 102 + eos_token_id: int | list[int] | None = 102 num_image_with_embedding: int | None = None def __post_init__(self, **kwargs): diff --git a/src/transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py b/src/transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py index c1a82f04315f..dd2b73e7020e 100644 --- a/src/transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +++ b/src/transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py @@ -84,7 +84,7 @@ class Glm4MoeLiteConfig(PreTrainedConfig): n_routed_experts: int = 64 routed_scaling_factor: float = 1.8 kv_lora_rank: int = 512 - q_lora_rank: int = 768 + q_lora_rank: int | None = 768 qk_rope_head_dim: int = 64 v_head_dim: int = 256 qk_nope_head_dim: int = 192 diff --git a/src/transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py b/src/transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py index 1794df505be8..2f6b92feae85 100644 --- a/src/transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +++ b/src/transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py @@ -92,7 +92,7 @@ class Glm4MoeLiteConfig(PreTrainedConfig): n_routed_experts: int = 64 routed_scaling_factor: float = 1.8 kv_lora_rank: int = 512 - q_lora_rank: int = 768 + q_lora_rank: int | None = 768 qk_rope_head_dim: int = 64 v_head_dim: int = 256 qk_nope_head_dim: int = 192 diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py index dd81dea1f2d6..6bc5ecd69423 100644 --- a/src/transformers/models/gpt2/configuration_gpt2.py +++ b/src/transformers/models/gpt2/configuration_gpt2.py @@ -95,7 +95,7 @@ class GPT2Config(PreTrainedConfig): scale_attn_weights: bool = True use_cache: bool = True bos_token_id: int | None = 50256 - eos_token_id: int | None = 50256 + eos_token_id: int | list[int] | None = 50256 pad_token_id: int | None = None scale_attn_by_inverse_layer_idx: bool = False reorder_and_upcast_attn: bool = False diff --git a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py index 3b8c9b27871a..d91052c36309 100644 --- a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py @@ -73,7 +73,7 @@ class GPTBigCodeConfig(PreTrainedConfig): scale_attn_weights: bool = True use_cache: bool = True bos_token_id: int | None = 50256 - eos_token_id: int | None = 50256 + 
eos_token_id: int | list[int] | None = 50256 pad_token_id: int | None = None attention_softmax_in_fp32: bool = True scale_attention_softmax_in_fp32: bool = True diff --git a/src/transformers/models/gpt_neo/configuration_gpt_neo.py b/src/transformers/models/gpt_neo/configuration_gpt_neo.py index b06bcf5f1ab5..b02781b55a42 100644 --- a/src/transformers/models/gpt_neo/configuration_gpt_neo.py +++ b/src/transformers/models/gpt_neo/configuration_gpt_neo.py @@ -66,7 +66,7 @@ class GPTNeoConfig(PreTrainedConfig): initializer_range: float = 0.02 use_cache: bool = True bos_token_id: int | None = 50256 - eos_token_id: int | None = 50256 + eos_token_id: int | list[int] | None = 50256 pad_token_id: int | None = None tie_word_embeddings: bool = True diff --git a/src/transformers/models/gptj/configuration_gptj.py b/src/transformers/models/gptj/configuration_gptj.py index b1ee0f363ebc..b63f60eb7b94 100644 --- a/src/transformers/models/gptj/configuration_gptj.py +++ b/src/transformers/models/gptj/configuration_gptj.py @@ -64,7 +64,7 @@ class GPTJConfig(PreTrainedConfig): initializer_range: float = 0.02 use_cache: bool = True bos_token_id: int | None = 50256 - eos_token_id: int | None = 50256 + eos_token_id: int | list[int] | None = 50256 pad_token_id: int | None = None tie_word_embeddings: bool = False diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py index 262100f98663..70a384f88ea1 100644 --- a/src/transformers/models/groupvit/configuration_groupvit.py +++ b/src/transformers/models/groupvit/configuration_groupvit.py @@ -57,7 +57,7 @@ class GroupViTTextConfig(PreTrainedConfig): initializer_factor: float = 1.0 pad_token_id: int | None = 1 bos_token_id: int | None = 49406 - eos_token_id: int | None = 49407 + eos_token_id: int | list[int] | None = 49407 @auto_docstring(checkpoint="nvidia/groupvit-gcc-yfcc") diff --git a/src/transformers/models/higgs_audio_v2/configuration_higgs_audio_v2.py b/src/transformers/models/higgs_audio_v2/configuration_higgs_audio_v2.py index 4cc4984a20aa..0963b488c0d7 100644 --- a/src/transformers/models/higgs_audio_v2/configuration_higgs_audio_v2.py +++ b/src/transformers/models/higgs_audio_v2/configuration_higgs_audio_v2.py @@ -86,7 +86,7 @@ class HiggsAudioV2Config(PreTrainedConfig): use_cache: bool = True pad_token_id: int | None = 128001 bos_token_id: int | None = 1 - eos_token_id: int | None = 128009 + eos_token_id: int | list[int] | None = 128009 pretraining_tp: int | None = 1 tie_word_embeddings: bool = False rope_parameters: RopeParameters | dict | None = None diff --git a/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py b/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py index f7430e850dc0..da33994b6767 100644 --- a/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py +++ b/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py @@ -75,7 +75,7 @@ class HiggsAudioV2Config(LlamaConfig): num_attention_heads: int = 24 num_key_value_heads: int = 8 pad_token_id: int | None = 128001 - eos_token_id: int | None = 128009 + eos_token_id: int | list[int] | None = 128009 head_dim: int | None = 128 num_codebooks: int = 8 codebook_size: int = 1024 diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py index 016b698ae6a8..18be6317c132 100644 --- a/src/transformers/models/hubert/configuration_hubert.py +++ b/src/transformers/models/hubert/configuration_hubert.py @@ -159,7 +159,7 @@ class 
HubertConfig(PreTrainedConfig): classifier_proj_size: int = 256 pad_token_id: int | None = 0 bos_token_id: int | None = 1 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 def __post_init__(self, **kwargs): self.num_feat_extract_layers = len(self.conv_dim) diff --git a/src/transformers/models/ibert/configuration_ibert.py b/src/transformers/models/ibert/configuration_ibert.py index ec0674b31247..1d8a9b95e1f4 100644 --- a/src/transformers/models/ibert/configuration_ibert.py +++ b/src/transformers/models/ibert/configuration_ibert.py @@ -53,7 +53,7 @@ class IBertConfig(PreTrainedConfig): layer_norm_eps: float = 1e-12 pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 quant_mode: bool = False force_dequant: str = "none" diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py index 4375025c6560..f4778fce2cf9 100644 --- a/src/transformers/models/idefics/configuration_idefics.py +++ b/src/transformers/models/idefics/configuration_idefics.py @@ -133,7 +133,7 @@ class IdeficsConfig(PreTrainedConfig): use_cache: bool = True pad_token_id: int | None = 0 bos_token_id: int | None = 1 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 tie_word_embeddings: bool = False cross_layer_interval: int = 1 qk_layer_norms: bool = False diff --git a/src/transformers/models/jamba/configuration_jamba.py b/src/transformers/models/jamba/configuration_jamba.py index a5dfb6fcd03c..8fdb63d4af5b 100644 --- a/src/transformers/models/jamba/configuration_jamba.py +++ b/src/transformers/models/jamba/configuration_jamba.py @@ -62,7 +62,7 @@ class JambaConfig(PreTrainedConfig): router_aux_loss_coef: float = 0.001 pad_token_id: int | None = 0 bos_token_id: int | None = 1 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 max_position_embeddings: int = 262144 attention_dropout: float | int = 0.0 num_experts_per_tok: int = 2 diff --git a/src/transformers/models/jina_embeddings_v3/configuration_jina_embeddings_v3.py b/src/transformers/models/jina_embeddings_v3/configuration_jina_embeddings_v3.py index fea8d7adbf02..10209ef607bd 100644 --- a/src/transformers/models/jina_embeddings_v3/configuration_jina_embeddings_v3.py +++ b/src/transformers/models/jina_embeddings_v3/configuration_jina_embeddings_v3.py @@ -61,7 +61,7 @@ class JinaEmbeddingsV3Config(PreTrainedConfig): layer_norm_eps: float = 1e-5 pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 use_cache: bool = True classifier_dropout: float | int | None = None tie_word_embeddings: bool = True diff --git a/src/transformers/models/kosmos2/configuration_kosmos2.py b/src/transformers/models/kosmos2/configuration_kosmos2.py index 8485ac91e603..39b1459fb0f5 100644 --- a/src/transformers/models/kosmos2/configuration_kosmos2.py +++ b/src/transformers/models/kosmos2/configuration_kosmos2.py @@ -56,7 +56,7 @@ class Kosmos2TextConfig(PreTrainedConfig): use_cache: bool = True pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 add_cross_attention: bool = False diff --git a/src/transformers/models/kosmos2_5/configuration_kosmos2_5.py b/src/transformers/models/kosmos2_5/configuration_kosmos2_5.py index b0cbe9551f66..5b8d099d9569 100644 --- a/src/transformers/models/kosmos2_5/configuration_kosmos2_5.py +++ 
b/src/transformers/models/kosmos2_5/configuration_kosmos2_5.py @@ -56,7 +56,7 @@ class Kosmos2_5TextConfig(PreTrainedConfig): use_cache: bool = True pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 @auto_docstring(checkpoint="microsoft/kosmos-2.5") diff --git a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py index 4ee3e7f4e9bb..010903e7beee 100644 --- a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py @@ -80,7 +80,7 @@ class LayoutLMv3Config(PreTrainedConfig): layer_norm_eps: float = 1e-5 pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 max_2d_position_embeddings: int = 1024 coordinate_size: int = 128 shape_size: int = 128 diff --git a/src/transformers/models/led/configuration_led.py b/src/transformers/models/led/configuration_led.py index ffe3314c4ade..b0466a2e13cc 100644 --- a/src/transformers/models/led/configuration_led.py +++ b/src/transformers/models/led/configuration_led.py @@ -78,7 +78,7 @@ class LEDConfig(PreTrainedConfig): classifier_dropout: float | int = 0.0 pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 attention_window: list[int] | int = 512 tie_word_embeddings: bool = True diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py index 10a3361861db..09547826685d 100644 --- a/src/transformers/models/llama4/configuration_llama4.py +++ b/src/transformers/models/llama4/configuration_llama4.py @@ -151,7 +151,7 @@ class Llama4TextConfig(PreTrainedConfig): use_cache: bool = True pad_token_id: int | None = None bos_token_id: int | None = 1 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 tie_word_embeddings: bool = False attention_dropout: float | int = 0.0 num_experts_per_tok: int = 1 diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index d49c1778f08c..6ed6296d947c 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -91,7 +91,7 @@ class LongcatFlashConfig(PreTrainedConfig): attention_bias: bool = False attention_dropout: float | int = 0.0 ffn_hidden_size: int = 12288 - q_lora_rank: int = 1536 + q_lora_rank: int | None = 1536 kv_lora_rank: int = 512 qk_nope_head_dim: int = 128 qk_rope_head_dim: int = 64 diff --git a/src/transformers/models/longt5/configuration_longt5.py b/src/transformers/models/longt5/configuration_longt5.py index aa5cd2a75e80..96c01a147981 100644 --- a/src/transformers/models/longt5/configuration_longt5.py +++ b/src/transformers/models/longt5/configuration_longt5.py @@ -72,7 +72,7 @@ class LongT5Config(PreTrainedConfig): encoder_attention_type: str = "local" use_cache: bool = True pad_token_id: int | None = 0 - eos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 1 bos_token_id: int | None = None is_decoder: bool = False tie_word_embeddings: bool = True diff --git a/src/transformers/models/luke/configuration_luke.py b/src/transformers/models/luke/configuration_luke.py index 51c2dfaf1be4..560d4fbf4c43 100644 --- 
a/src/transformers/models/luke/configuration_luke.py +++ b/src/transformers/models/luke/configuration_luke.py @@ -68,7 +68,7 @@ class LukeConfig(PreTrainedConfig): classifier_dropout: float | int | None = None pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 tie_word_embeddings: bool = True diff --git a/src/transformers/models/m2m_100/configuration_m2m_100.py b/src/transformers/models/m2m_100/configuration_m2m_100.py index 498c4a9a3562..d688a53162cb 100644 --- a/src/transformers/models/m2m_100/configuration_m2m_100.py +++ b/src/transformers/models/m2m_100/configuration_m2m_100.py @@ -68,7 +68,7 @@ class M2M100Config(PreTrainedConfig): scale_embedding: int = True pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 tie_word_embeddings: bool = True diff --git a/src/transformers/models/mamba2/configuration_mamba2.py b/src/transformers/models/mamba2/configuration_mamba2.py index 8f289a31caa3..d692c1b90c9b 100644 --- a/src/transformers/models/mamba2/configuration_mamba2.py +++ b/src/transformers/models/mamba2/configuration_mamba2.py @@ -68,7 +68,7 @@ class Mamba2Config(PreTrainedConfig): layer_norm_epsilon: float = 1e-5 pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 expand: int = 2 conv_kernel: int = 4 n_groups: int = 8 diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index 90e796fa4d3e..2a74319c346c 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -73,9 +73,9 @@ class MarianConfig(PreTrainedConfig): decoder_start_token_id: int = 58100 scale_embedding: bool = False pad_token_id: int | None = 58100 - eos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 0 bos_token_id: int | None = None - forced_eos_token_id: int | None = 0 + forced_eos_token_id: int | list[int] | None = 0 share_encoder_decoder_embeddings: bool = True is_decoder: bool = False tie_word_embeddings: bool = True diff --git a/src/transformers/models/markuplm/configuration_markuplm.py b/src/transformers/models/markuplm/configuration_markuplm.py index fecd089f55de..986acb774465 100644 --- a/src/transformers/models/markuplm/configuration_markuplm.py +++ b/src/transformers/models/markuplm/configuration_markuplm.py @@ -75,7 +75,7 @@ class MarkupLMConfig(PreTrainedConfig): layer_norm_eps: float = 1e-12 pad_token_id: int | None = 0 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 max_xpath_tag_unit_embeddings: int = 256 max_xpath_subs_unit_embeddings: int = 1024 tag_pad_id: int = 216 diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py index 094cc06742f0..a91e0d4f5e71 100644 --- a/src/transformers/models/mbart/configuration_mbart.py +++ b/src/transformers/models/mbart/configuration_mbart.py @@ -68,9 +68,9 @@ class MBartConfig(PreTrainedConfig): scale_embedding: int = False pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 decoder_start_token_id: int | None = None - forced_eos_token_id: int | None = 2 + forced_eos_token_id: int | list[int] | None = 2 is_decoder: bool = False tie_word_embeddings: bool = True diff --git 
a/src/transformers/models/mistral4/configuration_mistral4.py b/src/transformers/models/mistral4/configuration_mistral4.py index ceb252929f80..54433e30c2e5 100644 --- a/src/transformers/models/mistral4/configuration_mistral4.py +++ b/src/transformers/models/mistral4/configuration_mistral4.py @@ -77,7 +77,7 @@ class Mistral4Config(PreTrainedConfig): n_routed_experts: int = 128 routed_scaling_factor: float = 1.0 kv_lora_rank: int = 256 - q_lora_rank: int = 1024 + q_lora_rank: int | None = 1024 qk_rope_head_dim: int = 64 v_head_dim: int | None = 128 qk_nope_head_dim: int = 64 @@ -93,7 +93,7 @@ class Mistral4Config(PreTrainedConfig): use_cache: bool = True pad_token_id: int | None = 11 bos_token_id: int | None = 1 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 pretraining_tp: int | None = 1 tie_word_embeddings: bool = False rope_parameters: RopeParameters | dict | None = None diff --git a/src/transformers/models/mpnet/configuration_mpnet.py b/src/transformers/models/mpnet/configuration_mpnet.py index c4c34d61837e..0f2456939e17 100644 --- a/src/transformers/models/mpnet/configuration_mpnet.py +++ b/src/transformers/models/mpnet/configuration_mpnet.py @@ -58,7 +58,7 @@ class MPNetConfig(PreTrainedConfig): relative_attention_num_buckets: int = 32 pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 tie_word_embeddings: bool = True diff --git a/src/transformers/models/mra/configuration_mra.py b/src/transformers/models/mra/configuration_mra.py index 8b444dc9ab1f..d9058c33c458 100644 --- a/src/transformers/models/mra/configuration_mra.py +++ b/src/transformers/models/mra/configuration_mra.py @@ -68,7 +68,7 @@ class MraConfig(PreTrainedConfig): initial_prior_diagonal_n_blocks: int = 0 pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 add_cross_attention: bool = False tie_word_embeddings: bool = True diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py index 971402b2e742..4545bd03331b 100644 --- a/src/transformers/models/mt5/configuration_mt5.py +++ b/src/transformers/models/mt5/configuration_mt5.py @@ -61,7 +61,7 @@ class MT5Config(PreTrainedConfig): tie_word_embeddings: bool = True bos_token_id: int | None = None pad_token_id: int | None = 0 - eos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 1 decoder_start_token_id: int | None = 0 classifier_dropout: float | int = 0.0 is_decoder: bool = False diff --git a/src/transformers/models/mvp/configuration_mvp.py b/src/transformers/models/mvp/configuration_mvp.py index d0ae7390e664..88abc197f210 100644 --- a/src/transformers/models/mvp/configuration_mvp.py +++ b/src/transformers/models/mvp/configuration_mvp.py @@ -74,7 +74,7 @@ class MvpConfig(PreTrainedConfig): use_cache: bool = True pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 is_encoder_decoder: int = True decoder_start_token_id: int | None = 2 use_prompt: bool = False diff --git a/src/transformers/models/nemotron_h/configuration_nemotron_h.py b/src/transformers/models/nemotron_h/configuration_nemotron_h.py index e36b4d1d4fcd..99c7a40457bc 100644 --- a/src/transformers/models/nemotron_h/configuration_nemotron_h.py +++ b/src/transformers/models/nemotron_h/configuration_nemotron_h.py @@ -90,7 +90,7 @@ class NemotronHConfig(PreTrainedConfig): 
num_logits_to_keep: int = 1 pad_token_id: int | None = 0 bos_token_id: int | None = 1 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 num_attention_heads: int = 32 num_key_value_heads: int = 8 head_dim: int = 128 diff --git a/src/transformers/models/nllb_moe/configuration_nllb_moe.py b/src/transformers/models/nllb_moe/configuration_nllb_moe.py index 9128e185d36e..1401328344ef 100644 --- a/src/transformers/models/nllb_moe/configuration_nllb_moe.py +++ b/src/transformers/models/nllb_moe/configuration_nllb_moe.py @@ -114,7 +114,7 @@ class NllbMoeConfig(PreTrainedConfig): moe_token_dropout: float | int = 0.2 pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 tie_word_embeddings: bool = True output_router_logits: bool = False diff --git a/src/transformers/models/nystromformer/configuration_nystromformer.py b/src/transformers/models/nystromformer/configuration_nystromformer.py index e935c7f253c9..ffccb3c10f1c 100644 --- a/src/transformers/models/nystromformer/configuration_nystromformer.py +++ b/src/transformers/models/nystromformer/configuration_nystromformer.py @@ -69,7 +69,7 @@ class NystromformerConfig(PreTrainedConfig): layer_norm_eps: float = 1e-5 pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 add_cross_attention: bool = False tie_word_embeddings: bool = True diff --git a/src/transformers/models/olmo_hybrid/configuration_olmo_hybrid.py b/src/transformers/models/olmo_hybrid/configuration_olmo_hybrid.py index 29a6f3a503fa..83a7d046cdda 100644 --- a/src/transformers/models/olmo_hybrid/configuration_olmo_hybrid.py +++ b/src/transformers/models/olmo_hybrid/configuration_olmo_hybrid.py @@ -100,7 +100,7 @@ class OlmoHybridConfig(PreTrainedConfig): use_cache: bool = True pad_token_id: int | None = 100277 bos_token_id: int | None = None - eos_token_id: int | None = 100257 + eos_token_id: int | list[int] | None = 100257 tie_word_embeddings: bool = False rope_parameters: RopeParameters | dict | None = None attention_bias: bool = False diff --git a/src/transformers/models/olmo_hybrid/modular_olmo_hybrid.py b/src/transformers/models/olmo_hybrid/modular_olmo_hybrid.py index f9c9fc9dd1f3..d8fb6c5305a2 100644 --- a/src/transformers/models/olmo_hybrid/modular_olmo_hybrid.py +++ b/src/transformers/models/olmo_hybrid/modular_olmo_hybrid.py @@ -138,7 +138,7 @@ class OlmoHybridConfig(LlamaConfig): max_position_embeddings: int = 65536 pad_token_id: int | None = 100277 bos_token_id: int | None = None - eos_token_id: int | None = 100257 + eos_token_id: int | list[int] | None = 100257 rms_norm_eps: float = 1e-06 layer_types: list[str] | None = None linear_num_key_heads: int | None = None diff --git a/src/transformers/models/opt/configuration_opt.py b/src/transformers/models/opt/configuration_opt.py index f377b9c84134..76a94a1ae546 100644 --- a/src/transformers/models/opt/configuration_opt.py +++ b/src/transformers/models/opt/configuration_opt.py @@ -70,7 +70,7 @@ class OPTConfig(PreTrainedConfig): use_cache: bool = True pad_token_id: int | None = 1 bos_token_id: int | None = 2 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 enable_bias: bool = True layer_norm_elementwise_affine: bool = True tie_word_embeddings: bool = True diff --git a/src/transformers/models/owlv2/configuration_owlv2.py b/src/transformers/models/owlv2/configuration_owlv2.py index 439e3f2a8848..602f85b1b561 100644 --- 
a/src/transformers/models/owlv2/configuration_owlv2.py +++ b/src/transformers/models/owlv2/configuration_owlv2.py @@ -58,7 +58,7 @@ class Owlv2TextConfig(PreTrainedConfig): initializer_factor: float = 1.0 pad_token_id: int | None = 0 bos_token_id: int | None = 49406 - eos_token_id: int | None = 49407 + eos_token_id: int | list[int] | None = 49407 @strict(accept_kwargs=True) diff --git a/src/transformers/models/owlvit/configuration_owlvit.py b/src/transformers/models/owlvit/configuration_owlvit.py index 43dfcdc77b8c..31f5ec284fa7 100644 --- a/src/transformers/models/owlvit/configuration_owlvit.py +++ b/src/transformers/models/owlvit/configuration_owlvit.py @@ -57,7 +57,7 @@ class OwlViTTextConfig(PreTrainedConfig): initializer_factor: float = 1.0 pad_token_id: int | None = 0 bos_token_id: int | None = 49406 - eos_token_id: int | None = 49407 + eos_token_id: int | list[int] | None = 49407 @auto_docstring(checkpoint="google/owlvit-base-patch16") diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index 88c1529df6a0..47abd187bb04 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -67,8 +67,8 @@ class PegasusConfig(PreTrainedConfig): decoder_start_token_id: int | None = 0 scale_embedding: bool = False pad_token_id: int | None = 0 - eos_token_id: int | None = 1 - forced_eos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 1 + forced_eos_token_id: int | list[int] | None = 1 is_decoder: bool = False tie_word_embeddings: bool = True diff --git a/src/transformers/models/pegasus_x/configuration_pegasus_x.py b/src/transformers/models/pegasus_x/configuration_pegasus_x.py index 7b056bb4658b..ee8217119ecb 100644 --- a/src/transformers/models/pegasus_x/configuration_pegasus_x.py +++ b/src/transformers/models/pegasus_x/configuration_pegasus_x.py @@ -75,8 +75,8 @@ class PegasusXConfig(PreTrainedConfig): decoder_start_token_id: int | None = 0 scale_embedding: bool = True pad_token_id: int | None = 0 - eos_token_id: int | None = 1 - forced_eos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 1 + forced_eos_token_id: int | list[int] | None = 1 num_global_tokens: int = 32 block_size: int = 512 stagger_local_blocks: bool = True diff --git a/src/transformers/models/pix2struct/configuration_pix2struct.py b/src/transformers/models/pix2struct/configuration_pix2struct.py index 7064891e0c4e..5396570c1d34 100644 --- a/src/transformers/models/pix2struct/configuration_pix2struct.py +++ b/src/transformers/models/pix2struct/configuration_pix2struct.py @@ -75,7 +75,7 @@ class Pix2StructTextConfig(PreTrainedConfig): decoder_start_token_id: int = 0 use_cache: bool = False pad_token_id: int | None = 0 - eos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 1 bos_token_id: int | None = None tie_word_embeddings: bool = False is_decoder: bool = True diff --git a/src/transformers/models/plbart/configuration_plbart.py b/src/transformers/models/plbart/configuration_plbart.py index 01bb243d3bce..7e142829badd 100644 --- a/src/transformers/models/plbart/configuration_plbart.py +++ b/src/transformers/models/plbart/configuration_plbart.py @@ -69,8 +69,8 @@ class PLBartConfig(PreTrainedConfig): scale_embedding: bool = True pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 - forced_eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 + forced_eos_token_id: int | list[int] | 
None = 2 is_decoder: bool = False tie_word_embeddings: bool = True diff --git a/src/transformers/models/pop2piano/configuration_pop2piano.py b/src/transformers/models/pop2piano/configuration_pop2piano.py index 1e063ba5015b..9e1d2efe5d80 100644 --- a/src/transformers/models/pop2piano/configuration_pop2piano.py +++ b/src/transformers/models/pop2piano/configuration_pop2piano.py @@ -56,7 +56,7 @@ class Pop2PianoConfig(PreTrainedConfig): is_encoder_decoder: bool = True use_cache: bool = True pad_token_id: int | None = 0 - eos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 1 dense_act_fn: str = "relu" is_decoder: bool = False tie_word_embeddings: bool = True diff --git a/src/transformers/models/reformer/configuration_reformer.py b/src/transformers/models/reformer/configuration_reformer.py index c018bc940f99..f8df0902977a 100755 --- a/src/transformers/models/reformer/configuration_reformer.py +++ b/src/transformers/models/reformer/configuration_reformer.py @@ -120,7 +120,7 @@ class ReformerConfig(PreTrainedConfig): axial_pos_shape: list[int] | tuple[int, ...] = (64, 64) axial_pos_embds_dim: list[int] | tuple[int, ...] = (64, 192) chunk_size_lm_head: int = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 feed_forward_size: int = 512 hash_seed: int | None = None hidden_act: str = "relu" diff --git a/src/transformers/models/rembert/configuration_rembert.py b/src/transformers/models/rembert/configuration_rembert.py index 7a6b51e0f6d4..979858c9c650 100644 --- a/src/transformers/models/rembert/configuration_rembert.py +++ b/src/transformers/models/rembert/configuration_rembert.py @@ -63,7 +63,7 @@ class RemBertConfig(PreTrainedConfig): use_cache: bool = True pad_token_id: int | None = 0 bos_token_id: int | None = 312 - eos_token_id: int | None = 313 + eos_token_id: int | list[int] | None = 313 is_decoder: bool = False add_cross_attention: bool = False tie_word_embeddings: bool = False diff --git a/src/transformers/models/roberta/configuration_roberta.py b/src/transformers/models/roberta/configuration_roberta.py index 0c064b81b030..827b1046f779 100644 --- a/src/transformers/models/roberta/configuration_roberta.py +++ b/src/transformers/models/roberta/configuration_roberta.py @@ -55,7 +55,7 @@ class RobertaConfig(PreTrainedConfig): layer_norm_eps: float = 1e-12 pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 use_cache: bool = True classifier_dropout: float | int | None = None is_decoder: bool = False diff --git a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py index 04935b8f23d0..287511ef669b 100644 --- a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py @@ -56,7 +56,7 @@ class RobertaPreLayerNormConfig(PreTrainedConfig): layer_norm_eps: float = 1e-12 pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 use_cache: bool = True classifier_dropout: float | int | None = None is_decoder: bool = False diff --git a/src/transformers/models/rwkv/configuration_rwkv.py b/src/transformers/models/rwkv/configuration_rwkv.py index 56e325be105e..131065d464bd 100644 --- a/src/transformers/models/rwkv/configuration_rwkv.py +++ b/src/transformers/models/rwkv/configuration_rwkv.py 
@@ -59,7 +59,7 @@ class RwkvConfig(PreTrainedConfig): intermediate_size: int | None = None layer_norm_epsilon: float = 1e-5 bos_token_id: int | None = 0 - eos_token_id: int | None = 0 + eos_token_id: int | list[int] | None = 0 rescale_every: int = 6 tie_word_embeddings: bool = False use_cache: bool = True diff --git a/src/transformers/models/seamless_m4t/configuration_seamless_m4t.py b/src/transformers/models/seamless_m4t/configuration_seamless_m4t.py index cc230f549d3e..cab29e4201b3 100644 --- a/src/transformers/models/seamless_m4t/configuration_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/configuration_seamless_m4t.py @@ -181,7 +181,7 @@ class SeamlessM4TConfig(PreTrainedConfig): max_new_tokens: int | None = 256 pad_token_id: int | None = 0 bos_token_id: int | None = 2 - eos_token_id: int | None = 3 + eos_token_id: int | list[int] | None = 3 speech_encoder_layers: int = 24 speech_encoder_attention_heads: int = 16 speech_encoder_intermediate_size: int = 4096 @@ -202,7 +202,7 @@ class SeamlessM4TConfig(PreTrainedConfig): conv_depthwise_kernel_size: int = 31 t2u_bos_token_id: int | None = 0 t2u_pad_token_id: int | None = 1 - t2u_eos_token_id: int | None = 2 + t2u_eos_token_id: int | list[int] | None = 2 t2u_decoder_start_token_id: int = 2 t2u_max_new_tokens: int = 1024 t2u_encoder_layers: int = 6 diff --git a/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py index b0e1a7f8826c..650da169f1f0 100644 --- a/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py @@ -185,7 +185,7 @@ class SeamlessM4Tv2Config(PreTrainedConfig): max_new_tokens: int | None = 256 pad_token_id: int | None = 0 bos_token_id: int | None = 2 - eos_token_id: int | None = 3 + eos_token_id: int | list[int] | None = 3 speech_encoder_layers: int = 24 speech_encoder_attention_heads: int = 16 speech_encoder_intermediate_size: int = 4096 @@ -206,7 +206,7 @@ class SeamlessM4Tv2Config(PreTrainedConfig): speech_encoder_left_chunk_num: int = 128 t2u_bos_token_id: int | None = 0 t2u_pad_token_id: int | None = 1 - t2u_eos_token_id: int | None = 2 + t2u_eos_token_id: int | list[int] | None = 2 t2u_encoder_layers: int = 6 t2u_encoder_ffn_dim: int = 8192 t2u_encoder_attention_heads: int = 16 diff --git a/src/transformers/models/sew/configuration_sew.py b/src/transformers/models/sew/configuration_sew.py index 9c158e88d170..75eb11696359 100644 --- a/src/transformers/models/sew/configuration_sew.py +++ b/src/transformers/models/sew/configuration_sew.py @@ -148,7 +148,7 @@ class SEWConfig(PreTrainedConfig): classifier_proj_size: int = 256 pad_token_id: int | None = 0 bos_token_id: int | None = 1 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 def __post_init__(self, **kwargs): self.num_feat_extract_layers = len(self.conv_dim) diff --git a/src/transformers/models/sew_d/configuration_sew_d.py b/src/transformers/models/sew_d/configuration_sew_d.py index a5907624c4a0..334afa685931 100644 --- a/src/transformers/models/sew_d/configuration_sew_d.py +++ b/src/transformers/models/sew_d/configuration_sew_d.py @@ -168,7 +168,7 @@ class SEWDConfig(PreTrainedConfig): classifier_proj_size: int = 256 pad_token_id: int | None = 0 bos_token_id: int | None = 1 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 def __post_init__(self, **kwargs): self.num_feat_extract_layers = len(self.conv_dim) diff --git 
a/src/transformers/models/speech_to_text/configuration_speech_to_text.py b/src/transformers/models/speech_to_text/configuration_speech_to_text.py index 176954dc7ae9..573f6f31c955 100644 --- a/src/transformers/models/speech_to_text/configuration_speech_to_text.py +++ b/src/transformers/models/speech_to_text/configuration_speech_to_text.py @@ -84,7 +84,7 @@ class Speech2TextConfig(PreTrainedConfig): scale_embedding: bool = True pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 max_source_positions: int = 6000 max_target_positions: int = 1024 num_conv_layers: int = 2 diff --git a/src/transformers/models/speecht5/configuration_speecht5.py b/src/transformers/models/speecht5/configuration_speecht5.py index 22ca4ec5d0c3..9eff65923fea 100644 --- a/src/transformers/models/speecht5/configuration_speecht5.py +++ b/src/transformers/models/speecht5/configuration_speecht5.py @@ -175,7 +175,7 @@ class SpeechT5Config(PreTrainedConfig): mask_feature_min_masks: int = 0 pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 decoder_start_token_id: int | None = 2 num_mel_bins: int = 80 speech_decoder_prenet_layers: int = 2 diff --git a/src/transformers/models/switch_transformers/configuration_switch_transformers.py b/src/transformers/models/switch_transformers/configuration_switch_transformers.py index abec0cf4aabe..8e9cfdb53c94 100644 --- a/src/transformers/models/switch_transformers/configuration_switch_transformers.py +++ b/src/transformers/models/switch_transformers/configuration_switch_transformers.py @@ -84,7 +84,7 @@ class SwitchTransformersConfig(PreTrainedConfig): add_router_probs: bool = False use_cache: bool = True pad_token_id: int | None = 0 - eos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 1 bos_token_id: int | None = None tie_word_embeddings: bool = True is_decoder: bool = False diff --git a/src/transformers/models/t5/configuration_t5.py b/src/transformers/models/t5/configuration_t5.py index 35aca7536d48..f120670fe4bd 100644 --- a/src/transformers/models/t5/configuration_t5.py +++ b/src/transformers/models/t5/configuration_t5.py @@ -57,7 +57,7 @@ class T5Config(PreTrainedConfig): is_encoder_decoder: bool = True use_cache: bool = True pad_token_id: int | None = 0 - eos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 1 classifier_dropout: float | int = 0.0 is_decoder: bool = False diff --git a/src/transformers/models/trocr/configuration_trocr.py b/src/transformers/models/trocr/configuration_trocr.py index 014921933c7e..254aaa9221a9 100644 --- a/src/transformers/models/trocr/configuration_trocr.py +++ b/src/transformers/models/trocr/configuration_trocr.py @@ -70,7 +70,7 @@ class TrOCRConfig(PreTrainedConfig): layernorm_embedding: bool = True pad_token_id: int | None = 1 bos_token_id: int | None = 0 - eos_token_id: int | None = 2 + eos_token_id: int | list[int] | None = 2 cross_attention_hidden_size: int | None = None is_decoder: bool = False tie_word_embeddings: bool = True diff --git a/src/transformers/models/udop/configuration_udop.py b/src/transformers/models/udop/configuration_udop.py index a7cc63158ad2..5903147b7ca7 100644 --- a/src/transformers/models/udop/configuration_udop.py +++ b/src/transformers/models/udop/configuration_udop.py @@ -59,7 +59,7 @@ class UdopConfig(PreTrainedConfig): is_encoder_decoder: bool = True use_cache: bool = True pad_token_id: int | None = 0 - eos_token_id: int | None 
= 1
+    eos_token_id: int | list[int] | None = 1
     max_2d_position_embeddings: int = 1024
     image_size: int | list[int] | tuple[int, int] = 224
     patch_size: int | list[int] | tuple[int, int] = 16
diff --git a/src/transformers/models/umt5/configuration_umt5.py b/src/transformers/models/umt5/configuration_umt5.py
index b792ff6f1930..1cbc3b3f61f5 100644
--- a/src/transformers/models/umt5/configuration_umt5.py
+++ b/src/transformers/models/umt5/configuration_umt5.py
@@ -59,7 +59,7 @@ class UMT5Config(PreTrainedConfig):
     use_cache: bool = True
     tokenizer_class: str = "T5Tokenizer"
     pad_token_id: int | None = 0
-    eos_token_id: int | None = 1
+    eos_token_id: int | list[int] | None = 1
     decoder_start_token_id: int | None = 0
     classifier_dropout: float | int = 0.0
     is_decoder: bool = False
diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py
index 974ec1f30126..28c341b8a1e7 100644
--- a/src/transformers/models/unispeech/configuration_unispeech.py
+++ b/src/transformers/models/unispeech/configuration_unispeech.py
@@ -180,7 +180,7 @@ class UniSpeechConfig(PreTrainedConfig):
     num_ctc_classes: int = 80
     pad_token_id: int | None = 0
     bos_token_id: int | None = 1
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     replace_prob: float = 0.5
 
     def __post_init__(self, **kwargs):
diff --git a/src/transformers/models/visual_bert/configuration_visual_bert.py b/src/transformers/models/visual_bert/configuration_visual_bert.py
index 66cc58d11a3a..c8282a73ff68 100644
--- a/src/transformers/models/visual_bert/configuration_visual_bert.py
+++ b/src/transformers/models/visual_bert/configuration_visual_bert.py
@@ -68,7 +68,7 @@ class VisualBertConfig(PreTrainedConfig):
     special_visual_initialize: bool = True
     pad_token_id: int | None = 1
     bos_token_id: int | None = 0
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     tie_word_embeddings: bool = True
diff --git a/src/transformers/models/wav2vec2/configuration_wav2vec2.py b/src/transformers/models/wav2vec2/configuration_wav2vec2.py
index ddbb63331f6d..5611af57b7fd 100644
--- a/src/transformers/models/wav2vec2/configuration_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/configuration_wav2vec2.py
@@ -212,7 +212,7 @@ class Wav2Vec2Config(PreTrainedConfig):
     xvector_output_dim: int = 512
     pad_token_id: int | None = 0
     bos_token_id: int | None = 1
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     add_adapter: bool = False
     adapter_kernel_size: int = 3
     adapter_stride: int = 2
diff --git a/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py
index 504122adb64f..0ec5a84345a0 100644
--- a/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py
+++ b/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py
@@ -167,7 +167,7 @@ class Wav2Vec2BertConfig(PreTrainedConfig):
     xvector_output_dim: int = 512
     pad_token_id: int | None = 0
     bos_token_id: int | None = 1
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     add_adapter: bool = False
     adapter_kernel_size: int = 3
     adapter_stride: int = 2
diff --git a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py
index 91ca1deb9491..f00e9fd05a82 100644
--- a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py
+++ b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py
@@ -212,7 +212,7 @@ class Wav2Vec2ConformerConfig(PreTrainedConfig):
     xvector_output_dim: int = 512
     pad_token_id: int | None = 0
     bos_token_id: int | None = 1
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     add_adapter: bool = False
     adapter_kernel_size: int = 3
     adapter_stride: int = 2
diff --git a/src/transformers/models/wavlm/configuration_wavlm.py b/src/transformers/models/wavlm/configuration_wavlm.py
index 124820082a6f..f2bac57319e6 100644
--- a/src/transformers/models/wavlm/configuration_wavlm.py
+++ b/src/transformers/models/wavlm/configuration_wavlm.py
@@ -205,7 +205,7 @@ class WavLMConfig(PreTrainedConfig):
     num_ctc_classes: int = 80
     pad_token_id: int | None = 0
     bos_token_id: int | None = 1
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     add_adapter: bool = False
     adapter_kernel_size: int = 3
     adapter_stride: int = 2
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index dd205ac36bec..b6ca13898185 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -148,7 +148,7 @@ class WhisperConfig(PreTrainedConfig):
     max_target_positions: int = 448
     pad_token_id: int | None = 50256
     bos_token_id: int | None = 50256
-    eos_token_id: int | None = 50256
+    eos_token_id: int | list[int] | None = 50256
     suppress_tokens: list | None = None
     begin_suppress_tokens: list[int] | tuple[int, ...] | None = (220, 50256)
     use_weighted_layer_sum: bool = False
diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py
index 6a25787a1000..315d8d4e48e6 100644
--- a/src/transformers/models/x_clip/configuration_x_clip.py
+++ b/src/transformers/models/x_clip/configuration_x_clip.py
@@ -57,7 +57,7 @@ class XCLIPTextConfig(PreTrainedConfig):
     initializer_factor: float = 1.0
     pad_token_id: int | None = 1
     bos_token_id: int | None = 0
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
 
 
 @auto_docstring(checkpoint="microsoft/xclip-base-patch32")
diff --git a/src/transformers/models/xglm/configuration_xglm.py b/src/transformers/models/xglm/configuration_xglm.py
index 26fa0bfd0c8c..3b12f4e46d57 100644
--- a/src/transformers/models/xglm/configuration_xglm.py
+++ b/src/transformers/models/xglm/configuration_xglm.py
@@ -64,7 +64,7 @@ class XGLMConfig(PreTrainedConfig):
     decoder_start_token_id: int = 2
     pad_token_id: int | None = 1
     bos_token_id: int | None = 0
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     add_cross_attention: bool = False
     tie_word_embeddings: bool = True
diff --git a/src/transformers/models/xlm/configuration_xlm.py b/src/transformers/models/xlm/configuration_xlm.py
index c9a6a4ba59a6..dcada4a0bfee 100644
--- a/src/transformers/models/xlm/configuration_xlm.py
+++ b/src/transformers/models/xlm/configuration_xlm.py
@@ -138,7 +138,7 @@ class XLMConfig(PreTrainedConfig):
     lang_id: int = 0
     pad_token_id: int | None = 2
     bos_token_id: int | None = 0
-    eos_token_id: int | None = 1
+    eos_token_id: int | list[int] | None = 1
     tie_word_embeddings: bool = True
diff --git a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
index c79bc9c6659f..c578e1966edb 100644
--- a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
@@ -55,7 +55,7 @@ class XLMRobertaConfig(PreTrainedConfig):
     layer_norm_eps: float = 1e-12
     pad_token_id: int | None = 1
     bos_token_id: int | None = 0
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     use_cache: bool = True
     classifier_dropout: float | int | None = None
     is_decoder: bool = False
diff --git a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py
index c10ea187db5f..97d9ad87b2d5 100644
--- a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py
+++ b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py
@@ -54,7 +54,7 @@ class XLMRobertaXLConfig(PreTrainedConfig):
     layer_norm_eps: float = 1e-05
     pad_token_id: int | None = 1
     bos_token_id: int | None = 0
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     use_cache: bool = True
     classifier_dropout: float | int | None = None
     is_decoder: bool = False
diff --git a/src/transformers/models/xlnet/configuration_xlnet.py b/src/transformers/models/xlnet/configuration_xlnet.py
index a48ed19f6031..753e8b0d4d51 100644
--- a/src/transformers/models/xlnet/configuration_xlnet.py
+++ b/src/transformers/models/xlnet/configuration_xlnet.py
@@ -132,7 +132,7 @@ class XLNetConfig(PreTrainedConfig):
     end_n_top: int = 5
     pad_token_id: int | None = 5
     bos_token_id: int | None = 1
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     tie_word_embeddings: bool = True
 
     def __post_init__(self, **kwargs):
diff --git a/src/transformers/models/xmod/configuration_xmod.py b/src/transformers/models/xmod/configuration_xmod.py
index 885540e2a17d..7dedcf43a184 100644
--- a/src/transformers/models/xmod/configuration_xmod.py
+++ b/src/transformers/models/xmod/configuration_xmod.py
@@ -71,7 +71,7 @@ class XmodConfig(PreTrainedConfig):
     layer_norm_eps: float = 1e-12
     pad_token_id: int | None = 1
     bos_token_id: int | None = 0
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     use_cache: bool = True
     classifier_dropout: float | int | None = None
     pre_norm: bool = False
diff --git a/src/transformers/models/yoso/configuration_yoso.py b/src/transformers/models/yoso/configuration_yoso.py
index f66e26783322..16e5c272a8f0 100644
--- a/src/transformers/models/yoso/configuration_yoso.py
+++ b/src/transformers/models/yoso/configuration_yoso.py
@@ -73,7 +73,7 @@ class YosoConfig(PreTrainedConfig):
     lsh_backward: bool = True
     pad_token_id: int | None = 1
     bos_token_id: int | None = 0
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     add_cross_attention: bool = False
     tie_word_embeddings: bool = True
diff --git a/src/transformers/models/youtu/configuration_youtu.py b/src/transformers/models/youtu/configuration_youtu.py
index e9f2efb79697..50b02bebfaa7 100644
--- a/src/transformers/models/youtu/configuration_youtu.py
+++ b/src/transformers/models/youtu/configuration_youtu.py
@@ -69,7 +69,7 @@ class YoutuConfig(PreTrainedConfig):
     num_attention_heads: int = 16
     num_key_value_heads: int = 16
     kv_lora_rank: int = 512
-    q_lora_rank: int = 1536
+    q_lora_rank: int | None = 1536
     qk_rope_head_dim: int = 64
     v_head_dim: int | None = 128
     qk_nope_head_dim: int = 128
diff --git a/src/transformers/models/zamba/configuration_zamba.py b/src/transformers/models/zamba/configuration_zamba.py
index a1862cf4a39f..f0793f9c165e 100644
--- a/src/transformers/models/zamba/configuration_zamba.py
+++ b/src/transformers/models/zamba/configuration_zamba.py
@@ -72,7 +72,7 @@ class ZambaConfig(PreTrainedConfig):
     num_logits_to_keep: int = 1
     pad_token_id: int | None = 0
     bos_token_id: int | None = 1
-    eos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 2
     max_position_embeddings: int = 4096
     attention_dropout: float | int = 0.0
     attn_layer_period: int = 6

From e075431fefeef30c28805f07302af2f02930a947 Mon Sep 17 00:00:00 2001
From: raushan
Date: Thu, 19 Mar 2026 11:04:40 +0100
Subject: [PATCH 2/2] oops, wrong name for class

---
 src/transformers/configuration_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index d172f76a8f59..a727e091bc89 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -26,12 +26,12 @@
 from huggingface_hub.dataclasses import strict
 from packaging import version
 
-from .tokenization_utils_base import PreTrainedTokenizer
 from . import __version__
 from .dynamic_module_utils import custom_object_save
 from .generation.configuration_utils import GenerationConfig
 from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
 from .modeling_rope_utils import RotaryEmbeddingConfigMixin
+from .tokenization_utils_base import PreTrainedTokenizerBase
 from .utils import (
     CONFIG_NAME,
     PushToHubMixin,
@@ -190,7 +190,7 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
     problem_type: Literal["regression", "single_label_classification", "multi_label_classification"] | None = None
 
     # Tokenizer kwargs
-    tokenizer_class: str | PreTrainedTokenizer | None = None
+    tokenizer_class: str | PreTrainedTokenizerBase | None = None
 
     def __post_init__(self, **kwargs):
         # BC for the `torch_dtype` argument instead of the simpler `dtype`
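Reviewer note, not part of the patches above: a minimal usage sketch of what the widened annotations are intended to allow. The exact token ids below are illustrative assumptions (not taken from these diffs), and it assumes the strict-dataclass configs keep accepting keyword overrides for these fields as before.

    from transformers import WhisperConfig
    from transformers.models.youtu.configuration_youtu import YoutuConfig

    # `eos_token_id` is now annotated as `int | list[int] | None`, so a config
    # that stops generation on several token ids matches the declared type.
    config = WhisperConfig(eos_token_id=[50256, 50257])  # hypothetical multi-EOS setup
    print(config.eos_token_id)

    # `q_lora_rank` is now `int | None`, so disabling the query low-rank
    # projection by passing None also matches the annotation.
    youtu_config = YoutuConfig(q_lora_rank=None)  # assumption: None means "no q LoRA"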