Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion src/transformers/configuration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from .generation.configuration_utils import GenerationConfig
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
from .modeling_rope_utils import RotaryEmbeddingConfigMixin
from .tokenization_utils_base import PreTrainedTokenizerBase
from .utils import (
CONFIG_NAME,
PushToHubMixin,
Expand Down Expand Up @@ -189,7 +190,7 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
problem_type: Literal["regression", "single_label_classification", "multi_label_classification"] | None = None

# Tokenizer kwargs
tokenizer_class: str | None = None
tokenizer_class: str | PreTrainedTokenizerBase | None = None

def __post_init__(self, **kwargs):
# BC for the `torch_dtype` argument instead of the simpler `dtype`
Expand Down Expand Up @@ -1265,3 +1266,18 @@ def recursive_diff_dict(dict_a, dict_b, config_obj=None):

# The alias is only here for BC - we did not have the correct CamelCasing before
PretrainedConfig = PreTrainedConfig


def layer_type_validation(layer_types: list[str], num_hidden_layers: int | None = None, attention: bool = True):
    """Deprecated standalone validator for a config's ``layer_types`` list.

    Logs a deprecation warning, then verifies that every entry of
    ``layer_types`` is one of ``ALLOWED_LAYER_TYPES`` and, when
    ``num_hidden_layers`` is given, that the list length matches it.
    Raises ``ValueError`` on either violation.

    Note: ``attention`` is accepted but unused here; it is kept for
    backward compatibility with existing callers.
    """
    logger.warning(
        "`layer_type_validation` is deprecated and will be removed in v5.20. "
        "Use `PreTrainedConfig.validate_layer_type` instead"
    )

    # Reject any entry outside the allowed vocabulary of layer types.
    if any(entry not in ALLOWED_LAYER_TYPES for entry in layer_types):
        raise ValueError(f"The `layer_types` entries must be in {ALLOWED_LAYER_TYPES}")

    # When a layer count is provided, it must agree with the list length.
    if num_hidden_layers is not None and num_hidden_layers != len(layer_types):
        raise ValueError(
            f"`num_hidden_layers` ({num_hidden_layers}) must be equal to the number of layer types "
            f"({len(layer_types)})"
        )
Comment thread
hmellor marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class BlenderbotConfig(PreTrainedConfig):
bos_token_id: int | None = 1
eos_token_id: int | list[int] | None = 2
encoder_no_repeat_ngram_size: int = 3
forced_eos_token_id: int | None = 2
forced_eos_token_id: int | list[int] | None = 2
is_decoder: bool = False
tie_word_embeddings: bool = True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class BlenderbotSmallConfig(PreTrainedConfig):
pad_token_id: int | None = 0
bos_token_id: int | None = 1
eos_token_id: int | list[int] | None = 2
forced_eos_token_id: int | None = 2
forced_eos_token_id: int | list[int] | None = 2
is_decoder: bool = False
tie_word_embeddings: bool = True

Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/clvp/configuration_clvp.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ class ClvpDecoderConfig(PreTrainedConfig):
summary_first_dropout: float | int = 0.1
use_cache: bool = True
bos_token_id: int | None = 8192
eos_token_id: int | None = 8193
eos_token_id: int | list[int] | None = 8193
pad_token_id: int | None = None
feature_size: int = 80
use_attention_bias: bool = True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class DeepseekV3Config(PreTrainedConfig):
n_routed_experts: int = 256
routed_scaling_factor: float = 2.5
kv_lora_rank: int = 512
q_lora_rank: int = 1536
q_lora_rank: int | None = 1536
qk_rope_head_dim: int = 64
v_head_dim: int | None = 128
qk_nope_head_dim: int = 128
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/dia/configuration_dia.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class DiaDecoderConfig(PreTrainedConfig):
use_cache: bool = True
is_encoder_decoder: bool = True
pad_token_id: int | None = 1025
eos_token_id: int | None = 1024
eos_token_id: int | list[int] | None = 1024
bos_token_id: int | None = 1026


Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/eurobert/configuration_eurobert.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ class EuroBertConfig(LlamaConfig):
initializer_range: float = 0.02
rms_norm_eps: float = 1e-05
bos_token_id: int | None = 128000
eos_token_id: int | None = 128001
eos_token_id: int | list[int] | None = 128001
pad_token_id: int | None = 128001
mask_token_id: int = 128002
pretraining_tp: int = 1
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/eurobert/modular_eurobert.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class EuroBertConfig(LlamaConfig):
initializer_range: float = 0.02
rms_norm_eps: float = 1e-05
bos_token_id: int | None = 128000
eos_token_id: int | None = 128001
eos_token_id: int | list[int] | None = 128001
pad_token_id: int | None = 128001
mask_token_id: int = 128002
pretraining_tp: int = 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ class ExaoneMoeConfig(PreTrainedConfig):
rms_norm_eps: float = 1e-5
use_cache: bool = True
bos_token_id: int | None = 1
eos_token_id: int | None = 53
eos_token_id: int | list[int] | None = 53
pad_token_id: int | None = 0
tie_word_embeddings: bool = False
rope_parameters: dict | None = None
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/exaone_moe/modular_exaone_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ class ExaoneMoeConfig(Exaone4Config):
rms_norm_eps: float = 1e-5
use_cache: bool = True
bos_token_id: int | None = 1
eos_token_id: int | None = 53
eos_token_id: int | list[int] | None = 53
pad_token_id: int | None = 0
tie_word_embeddings: bool = False
rope_parameters: dict | None = None
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/flaubert/configuration_flaubert.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ class FlaubertConfig(PreTrainedConfig):
lang_id: int = 0
pad_token_id: int | None = 2
bos_token_id: int | None = 0
eos_token_id: int | None = 1
eos_token_id: int | list[int] | None = 1
tie_word_embeddings: bool = True


Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/fnet/configuration_fnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class FNetConfig(PreTrainedConfig):
tpu_short_seq_length: int = 512
pad_token_id: int | None = 3
bos_token_id: int | None = 1
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
tie_word_embeddings: bool = True


Expand Down
4 changes: 2 additions & 2 deletions src/transformers/models/fsmt/configuration_fsmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ class FSMTConfig(PreTrainedConfig):
use_cache: bool = True
pad_token_id: int | None = 1
bos_token_id: int | None = 0
eos_token_id: int | None = 2
forced_eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
forced_eos_token_id: int | list[int] | None = 2

def __post_init__(self, **kwargs):
kwargs.pop("decoder", None) # delete unused kwargs
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/git/configuration_git.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class GitConfig(PreTrainedConfig):
use_cache: bool = True
tie_word_embeddings: bool = False
bos_token_id: int | None = 101
eos_token_id: int | None = 102
eos_token_id: int | list[int] | None = 102
num_image_with_embedding: int | None = None

def __post_init__(self, **kwargs):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class Glm4MoeLiteConfig(PreTrainedConfig):
n_routed_experts: int = 64
routed_scaling_factor: float = 1.8
kv_lora_rank: int = 512
q_lora_rank: int = 768
q_lora_rank: int | None = 768
qk_rope_head_dim: int = 64
v_head_dim: int = 256
qk_nope_head_dim: int = 192
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ class Glm4MoeLiteConfig(PreTrainedConfig):
n_routed_experts: int = 64
routed_scaling_factor: float = 1.8
kv_lora_rank: int = 512
q_lora_rank: int = 768
q_lora_rank: int | None = 768
qk_rope_head_dim: int = 64
v_head_dim: int = 256
qk_nope_head_dim: int = 192
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/gpt2/configuration_gpt2.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ class GPT2Config(PreTrainedConfig):
scale_attn_weights: bool = True
use_cache: bool = True
bos_token_id: int | None = 50256
eos_token_id: int | None = 50256
eos_token_id: int | list[int] | None = 50256
pad_token_id: int | None = None
scale_attn_by_inverse_layer_idx: bool = False
reorder_and_upcast_attn: bool = False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class GPTBigCodeConfig(PreTrainedConfig):
scale_attn_weights: bool = True
use_cache: bool = True
bos_token_id: int | None = 50256
eos_token_id: int | None = 50256
eos_token_id: int | list[int] | None = 50256
pad_token_id: int | None = None
attention_softmax_in_fp32: bool = True
scale_attention_softmax_in_fp32: bool = True
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/gpt_neo/configuration_gpt_neo.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class GPTNeoConfig(PreTrainedConfig):
initializer_range: float = 0.02
use_cache: bool = True
bos_token_id: int | None = 50256
eos_token_id: int | None = 50256
eos_token_id: int | list[int] | None = 50256
pad_token_id: int | None = None
tie_word_embeddings: bool = True

Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/gptj/configuration_gptj.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class GPTJConfig(PreTrainedConfig):
initializer_range: float = 0.02
use_cache: bool = True
bos_token_id: int | None = 50256
eos_token_id: int | None = 50256
eos_token_id: int | list[int] | None = 50256
pad_token_id: int | None = None
tie_word_embeddings: bool = False

Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/groupvit/configuration_groupvit.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class GroupViTTextConfig(PreTrainedConfig):
initializer_factor: float = 1.0
pad_token_id: int | None = 1
bos_token_id: int | None = 49406
eos_token_id: int | None = 49407
eos_token_id: int | list[int] | None = 49407


@auto_docstring(checkpoint="nvidia/groupvit-gcc-yfcc")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ class HiggsAudioV2Config(PreTrainedConfig):
use_cache: bool = True
pad_token_id: int | None = 128001
bos_token_id: int | None = 1
eos_token_id: int | None = 128009
eos_token_id: int | list[int] | None = 128009
pretraining_tp: int | None = 1
tie_word_embeddings: bool = False
rope_parameters: RopeParameters | dict | None = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class HiggsAudioV2Config(LlamaConfig):
num_attention_heads: int = 24
num_key_value_heads: int = 8
pad_token_id: int | None = 128001
eos_token_id: int | None = 128009
eos_token_id: int | list[int] | None = 128009
head_dim: int | None = 128
num_codebooks: int = 8
codebook_size: int = 1024
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/hubert/configuration_hubert.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ class HubertConfig(PreTrainedConfig):
classifier_proj_size: int = 256
pad_token_id: int | None = 0
bos_token_id: int | None = 1
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2

def __post_init__(self, **kwargs):
self.num_feat_extract_layers = len(self.conv_dim)
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/ibert/configuration_ibert.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class IBertConfig(PreTrainedConfig):
layer_norm_eps: float = 1e-12
pad_token_id: int | None = 1
bos_token_id: int | None = 0
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
quant_mode: bool = False
force_dequant: str = "none"

Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/idefics/configuration_idefics.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ class IdeficsConfig(PreTrainedConfig):
use_cache: bool = True
pad_token_id: int | None = 0
bos_token_id: int | None = 1
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
tie_word_embeddings: bool = False
cross_layer_interval: int = 1
qk_layer_norms: bool = False
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/jamba/configuration_jamba.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class JambaConfig(PreTrainedConfig):
router_aux_loss_coef: float = 0.001
pad_token_id: int | None = 0
bos_token_id: int | None = 1
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
max_position_embeddings: int = 262144
attention_dropout: float | int = 0.0
num_experts_per_tok: int = 2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ class JinaEmbeddingsV3Config(PreTrainedConfig):
layer_norm_eps: float = 1e-5
pad_token_id: int | None = 1
bos_token_id: int | None = 0
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
use_cache: bool = True
classifier_dropout: float | int | None = None
tie_word_embeddings: bool = True
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/kosmos2/configuration_kosmos2.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class Kosmos2TextConfig(PreTrainedConfig):
use_cache: bool = True
pad_token_id: int | None = 1
bos_token_id: int | None = 0
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
add_cross_attention: bool = False


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class Kosmos2_5TextConfig(PreTrainedConfig):
use_cache: bool = True
pad_token_id: int | None = 1
bos_token_id: int | None = 0
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2


@auto_docstring(checkpoint="microsoft/kosmos-2.5")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class LayoutLMv3Config(PreTrainedConfig):
layer_norm_eps: float = 1e-5
pad_token_id: int | None = 1
bos_token_id: int | None = 0
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
max_2d_position_embeddings: int = 1024
coordinate_size: int = 128
shape_size: int = 128
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/led/configuration_led.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class LEDConfig(PreTrainedConfig):
classifier_dropout: float | int = 0.0
pad_token_id: int | None = 1
bos_token_id: int | None = 0
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
attention_window: list[int] | int = 512
tie_word_embeddings: bool = True

Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/llama4/configuration_llama4.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ class Llama4TextConfig(PreTrainedConfig):
use_cache: bool = True
pad_token_id: int | None = None
bos_token_id: int | None = 1
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
tie_word_embeddings: bool = False
attention_dropout: float | int = 0.0
num_experts_per_tok: int = 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ class LongcatFlashConfig(PreTrainedConfig):
attention_bias: bool = False
attention_dropout: float | int = 0.0
ffn_hidden_size: int = 12288
q_lora_rank: int = 1536
q_lora_rank: int | None = 1536
kv_lora_rank: int = 512
qk_nope_head_dim: int = 128
qk_rope_head_dim: int = 64
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/longt5/configuration_longt5.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class LongT5Config(PreTrainedConfig):
encoder_attention_type: str = "local"
use_cache: bool = True
pad_token_id: int | None = 0
eos_token_id: int | None = 1
eos_token_id: int | list[int] | None = 1
bos_token_id: int | None = None
is_decoder: bool = False
tie_word_embeddings: bool = True
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/luke/configuration_luke.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class LukeConfig(PreTrainedConfig):
classifier_dropout: float | int | None = None
pad_token_id: int | None = 1
bos_token_id: int | None = 0
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
tie_word_embeddings: bool = True


Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/m2m_100/configuration_m2m_100.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class M2M100Config(PreTrainedConfig):
scale_embedding: int = True
pad_token_id: int | None = 1
bos_token_id: int | None = 0
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
tie_word_embeddings: bool = True


Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/mamba2/configuration_mamba2.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class Mamba2Config(PreTrainedConfig):
layer_norm_epsilon: float = 1e-5
pad_token_id: int | None = 1
bos_token_id: int | None = 0
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
expand: int = 2
conv_kernel: int = 4
n_groups: int = 8
Expand Down
4 changes: 2 additions & 2 deletions src/transformers/models/marian/configuration_marian.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ class MarianConfig(PreTrainedConfig):
decoder_start_token_id: int = 58100
scale_embedding: bool = False
pad_token_id: int | None = 58100
eos_token_id: int | None = 0
eos_token_id: int | list[int] | None = 0
bos_token_id: int | None = None
forced_eos_token_id: int | None = 0
forced_eos_token_id: int | list[int] | None = 0
share_encoder_decoder_embeddings: bool = True
is_decoder: bool = False
tie_word_embeddings: bool = True
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/markuplm/configuration_markuplm.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class MarkupLMConfig(PreTrainedConfig):
layer_norm_eps: float = 1e-12
pad_token_id: int | None = 0
bos_token_id: int | None = 0
eos_token_id: int | None = 2
eos_token_id: int | list[int] | None = 2
max_xpath_tag_unit_embeddings: int = 256
max_xpath_subs_unit_embeddings: int = 1024
tag_pad_id: int = 216
Expand Down
Loading
Loading