Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
a3a8726
remove from base
zucchini-nlp Oct 10, 2025
7569f17
delete
zucchini-nlp Oct 13, 2025
83db459
fetcher fix
zucchini-nlp Oct 13, 2025
66225ab
missing values
zucchini-nlp Oct 13, 2025
6d322fa
update
zucchini-nlp Oct 13, 2025
cd1c645
is decoder missing
zucchini-nlp Oct 14, 2025
a744da3
forgot to add
zucchini-nlp Oct 14, 2025
212e609
add special tokens with default `None` in text models
zucchini-nlp Oct 14, 2025
c45264c
fsmt has unused subconfig, fix it!
zucchini-nlp Oct 14, 2025
589a776
update
zucchini-nlp Oct 14, 2025
cb03af1
Merge branch 'main' into config-inheritance
zucchini-nlp Oct 15, 2025
a7ea9dc
Merge branch 'main' into config-inheritance
zucchini-nlp Oct 15, 2025
9520541
fix
zucchini-nlp Oct 15, 2025
a94cd75
add missing token id defaults
zucchini-nlp Oct 15, 2025
338558c
fix more tests
zucchini-nlp Oct 16, 2025
0e6f6f7
tie_word_embeddings
zucchini-nlp Oct 16, 2025
fb0c58d
tiny fixes
zucchini-nlp Oct 17, 2025
05699a7
more test fixes
zucchini-nlp Oct 17, 2025
a913528
fix docstrings
zucchini-nlp Oct 17, 2025
87e610d
fix copies
zucchini-nlp Oct 17, 2025
ad1930e
fix style?
zucchini-nlp Oct 17, 2025
57b1736
rebase main
zucchini-nlp Oct 17, 2025
74a4a46
fix copied again
zucchini-nlp Oct 17, 2025
9afe474
merge main
zucchini-nlp Oct 17, 2025
f79588e
fix copies
zucchini-nlp Oct 17, 2025
7d3c3cf
fix examples
zucchini-nlp Oct 17, 2025
6565152
delete left over print stmt
zucchini-nlp Oct 17, 2025
69efa1f
Merge branch 'main' into config-inheritance
zucchini-nlp Nov 6, 2025
d909306
splitnter
zucchini-nlp Nov 6, 2025
796b312
Merge branch 'main' into config-inheritance
zucchini-nlp Nov 14, 2025
3d01b44
this will definitely fix a bunch of decoder-only models
zucchini-nlp Nov 17, 2025
d696e05
it's gonna be so much fun to fix issues after refactors on main...
zucchini-nlp Dec 8, 2025
9c87fd5
make style
zucchini-nlp Dec 8, 2025
3a41439
fix copies
zucchini-nlp Dec 8, 2025
c831852
WTF, I rebased 5 min ago?!
zucchini-nlp Dec 8, 2025
83bc532
not all models are supposed to have an attr for `tie_word_embeddings`!
zucchini-nlp Dec 8, 2025
1be91a1
merge main
zucchini-nlp Dec 22, 2025
f034540
comment out
zucchini-nlp Dec 22, 2025
5f803ff
fix
zucchini-nlp Dec 22, 2025
4ef5d93
more fixes
zucchini-nlp Dec 22, 2025
73d8d24
fix copies
zucchini-nlp Dec 22, 2025
137493b
docstring and non-model tests
zucchini-nlp Dec 22, 2025
c1f0aae
update
zucchini-nlp Dec 22, 2025
b8ed5b3
fix repo consistency
zucchini-nlp Dec 22, 2025
be62176
merge main
zucchini-nlp Jan 8, 2026
e3333fb
style
zucchini-nlp Jan 8, 2026
2fba81a
fix
zucchini-nlp Jan 8, 2026
2cc234a
Merge branch 'main' into config-inheritance
zucchini-nlp Jan 8, 2026
68545e7
revert
zucchini-nlp Jan 12, 2026
233c986
Merge branch 'main' into config-inheritance
zucchini-nlp Jan 12, 2026
2fd964b
remove unused attr
zucchini-nlp Jan 12, 2026
66dd842
fix repo
zucchini-nlp Jan 12, 2026
fbe85de
fix test
zucchini-nlp Jan 12, 2026
1391a5e
Merge branch 'main' into config-inheritance
zucchini-nlp Jan 12, 2026
f76536e
fix a few tests, more tests
zucchini-nlp Jan 12, 2026
4fdf142
fix gemma & llava
zucchini-nlp Jan 13, 2026
d046c0f
style
zucchini-nlp Jan 13, 2026
6c6e720
gemma3n also
zucchini-nlp Jan 13, 2026
9fe2176
Merge branch 'main commit c0d2e26f' into config-inheritance
ydshieh Jan 13, 2026
2d4da5f
merge main
zucchini-nlp Jan 15, 2026
840e8ea
new models as well
zucchini-nlp Jan 15, 2026
5cc58bd
skip the test
zucchini-nlp Jan 15, 2026
9909482
Merge branch 'main' into config-inheritance
zucchini-nlp Jan 15, 2026
b2c7337
Merge branch 'main' into config-inheritance
zucchini-nlp Jan 15, 2026
0e9d3d2
Merge branch 'main' into config-inheritance
zucchini-nlp Jan 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
Original file line number Diff line number Diff line change
Expand Up @@ -562,11 +562,9 @@ def remove_special_characters(batch):
with open(vocab_file, "w") as file:
json.dump(vocab_dict, file)

# if tokenizer has just been created
# it is defined by `tokenizer_class` if present in config else by `model_type`
tokenizer_kwargs = {
"config": config if config.tokenizer_class is not None else None,
"tokenizer_type": (config.model_type if config.tokenizer_class is None else None),
"config": config,
"tokenizer_type": config.model_type,
"unk_token": unk_token,
"pad_token": pad_token,
"word_delimiter_token": word_delimiter_token,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -566,11 +566,9 @@ def remove_special_characters(batch):
with open(vocab_file, "w") as file:
json.dump(vocab_dict, file)

# if tokenizer has just been created
# it is defined by `tokenizer_class` if present in config else by `model_type`
tokenizer_kwargs = {
"config": config if config.tokenizer_class is not None else None,
"tokenizer_type": (config.model_type if config.tokenizer_class is None else None),
"config": config,
"tokenizer_type": config.model_type,
"unk_token": unk_token,
"pad_token": pad_token,
"word_delimiter_token": word_delimiter_token,
Expand Down
68 changes: 0 additions & 68 deletions src/transformers/configuration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,16 +114,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
Whether or not the model should return a [`~transformers.utils.ModelOutput`] instead of a plain tuple.
is_encoder_decoder (`bool`, *optional*, defaults to `False`):
Whether the model is used as an encoder/decoder or not.
is_decoder (`bool`, *optional*, defaults to `False`):
Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on
decoder-only or encoder-only architectures.
cross_attention_hidden_size (`bool`, *optional*):
The hidden size of the cross-attention layer in case the model is used as a decoder in an encoder-decoder
setting and the cross-attention hidden dimension differs from `self.config.hidden_size`.
add_cross_attention (`bool`, *optional*, defaults to `False`):
Whether cross-attention layers should be added to the model. Note, this option is only relevant for models
that can be used as decoder models within the [`EncoderDecoderModel`] class, which consists of all models
in `AUTO_MODELS_FOR_CAUSAL_LM`.
chunk_size_feed_forward (`int`, *optional*, defaults to `0`):
The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means that
the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes `n` <
Expand All @@ -134,43 +124,18 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):

architectures (`list[str]`, *optional*):
Model architectures that can be used with the model pretrained weights.
finetuning_task (`str`, *optional*):
Name of the task used to fine-tune the model.
id2label (`dict[int, str]`, *optional*):
A map from index (for instance prediction index, or target index) to label.
label2id (`dict[str, int]`, *optional*):
A map from label to index for the model.
num_labels (`int`, *optional*):
Number of labels to use in the last layer added to the model, typically for a classification task.
task_specific_params (`dict[str, Any]`, *optional*):
Additional keyword arguments to store for the current task.
problem_type (`str`, *optional*):
Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`,
`"single_label_classification"` or `"multi_label_classification"`.

> Parameters linked to the tokenizer

tokenizer_class (`str`, *optional*):
The name of the associated tokenizer class to use (if none is set, will use the tokenizer associated to the
model by default).
prefix (`str`, *optional*):
A specific prompt that should be added at the beginning of each text before calling the model.
bos_token_id (`int`, *optional*):
The id of the _beginning-of-stream_ token.
pad_token_id (`int`, *optional*):
The id of the _padding_ token.
eos_token_id (`int`, *optional*):
The id of the _end-of-stream_ token.
decoder_start_token_id (`int`, *optional*):
If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token.
sep_token_id (`int`, *optional*):
The id of the _separation_ token.

> PyTorch specific parameters

tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
model has a output word embedding layer.
dtype (`str`, *optional*):
The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
(which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
Expand Down Expand Up @@ -207,28 +172,14 @@ def __init__(
return_dict: bool = True,
dtype: Union[str, "torch.dtype"] | None = None,
# Common arguments
tie_word_embeddings: bool = True,
chunk_size_feed_forward: int = 0,
is_encoder_decoder: bool = False,
is_decoder: bool = False,
cross_attention_hidden_size: int | None = None,
add_cross_attention: bool = False,
# Fine-tuning task arguments
architectures: list[str] | None = None,
finetuning_task: str | None = None,
id2label: dict[int, str] | None = None,
label2id: dict[str, int] | None = None,
num_labels: int | None = None,
task_specific_params: dict[str, Any] | None = None,
problem_type: str | None = None,
# Tokenizer kwargs
tokenizer_class: str | None = None,
prefix: str | None = None,
bos_token_id: int | None = None,
pad_token_id: int | None = None,
eos_token_id: int | None = None,
sep_token_id: int | None = None,
decoder_start_token_id: int | None = None,
**kwargs,
):
# Validation for some arguments
Expand Down Expand Up @@ -276,25 +227,15 @@ def __init__(
self._output_attentions = output_attentions # has public property

# Less common kwargs, only used by some models
if "tie_encoder_decoder" in kwargs:
tie_encoder_decoder = kwargs.pop("tie_encoder_decoder")
tie_word_embeddings = tie_encoder_decoder or tie_word_embeddings

Comment on lines -279 to -282
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am deleting `tie_encoder_decoder`. I can't find any model on the Hub that uses it, and the attribute was added only for the custom `EncoderDecoderModel` class. However, we can no longer tie the encoder to the decoder in `EncoderDecoderModel`, regardless of the value of this attribute.

self.tie_word_embeddings = tie_word_embeddings
self.chunk_size_feed_forward = chunk_size_feed_forward

# Encoder-decoder models attributes
self.is_encoder_decoder = is_encoder_decoder
self.is_decoder = is_decoder # used in encoder-decoder models to differentiate encoder from decoder
self.cross_attention_hidden_size = cross_attention_hidden_size
self.add_cross_attention = add_cross_attention

# Fine-tuning task attributes
self.architectures = architectures
self.finetuning_task = finetuning_task
self.id2label = id2label
self.label2id = label2id
self.task_specific_params = task_specific_params
self.problem_type = problem_type

if self.id2label is None:
Expand All @@ -303,15 +244,6 @@ def __init__(
# Keys are always strings in JSON so convert ids to int here.
self.id2label = {int(key): value for key, value in self.id2label.items()}

# Tokenizer attributes
self.tokenizer_class = tokenizer_class
self.prefix = prefix
self.bos_token_id = bos_token_id
self.pad_token_id = pad_token_id
self.eos_token_id = eos_token_id
self.sep_token_id = sep_token_id
self.decoder_start_token_id = decoder_start_token_id

# Parameters for sequence generation saved in the config are popped instead of loading them.
for parameter_name in GenerationConfig._get_default_generation_params().keys():
kwargs.pop(parameter_name, None)
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/integrations/executorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -994,7 +994,7 @@ def generate(self, prompt_token_ids, max_new_tokens):
decoder_input_ids = torch.tensor([[next_token]], dtype=torch.long, device=model_device)

# Check if EOS token
if next_token == self.config.eos_token_id:
if next_token == self.generation_config.eos_token_id:
break

return generated_ids
Expand Down
18 changes: 10 additions & 8 deletions src/transformers/modeling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@ def get_extended_attention_mask(
# Provided a padding mask of dimensions [batch_size, seq_length]
# - if the model is a decoder, apply a causal mask in addition to the padding mask
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder:
if getattr(self.config, "is_decoder", None):
extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder(
input_shape, attention_mask
)
Expand Down Expand Up @@ -2392,7 +2392,10 @@ def get_expanded_tied_weights_keys(self, all_submodels: bool = False) -> dict:

tied_mapping = self._tied_weights_keys
# If the config does not specify any tying, return empty dict
if not self.config.tie_word_embeddings:
# NOTE: not all modules have `tie_word_embeddings` attr, for example vision-only
# modules do not have any word embeddings!
tie_word_embeddings = getattr(self.config, "tie_word_embeddings", False)
if not tie_word_embeddings:
Comment on lines +2395 to +2398
Copy link
Member Author

@zucchini-nlp zucchini-nlp Dec 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

important! Please review here. There are two big changes:

  1. We no longer assume that `tie_word_embeddings` exists on all configs. Some models have no input/output word embeddings and thus nothing to tie (e.g. ViT).
  2. We check for the text config here. In multimodal models, `tie_word_embeddings` usually lives under the text config, but the text module will not have a `_tied_weights_keys` class attribute. For example, PaliGemma might tie weights via its text config, but `GemmaModel` is a base model with no LM head. In that case we do not execute the rest of the code and exit early.

return {}
# If None, return empty dict
elif tied_mapping is None:
Expand Down Expand Up @@ -2642,10 +2645,7 @@ def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None, mean
new_num_tokens = new_embeddings.weight.shape[0]

# if word embeddings are not tied, make sure that lm head is resized as well
if (
self.get_output_embeddings() is not None
and not self.config.get_text_config(decoder=True).tie_word_embeddings
):
if self.get_output_embeddings() is not None:
old_lm_head = self.get_output_embeddings()
if isinstance(old_lm_head, torch.nn.Embedding):
new_lm_head = self._get_resized_embeddings(old_lm_head, new_num_tokens, mean_resizing=mean_resizing)
Expand Down Expand Up @@ -4302,15 +4302,17 @@ def warn_if_padding_and_no_attention_mask(self, input_ids, attention_mask):

# If the pad token is equal to either BOS, EOS, or SEP, we do not know whether the user should use an
# attention_mask or not. In this case, we should still show a warning because this is a rare case.
# NOTE: `sep_token_id` is not used in all models and it can be absent in the config
sep_token_id = getattr(self.config, "sep_token_id", None)
if (
(self.config.bos_token_id is not None and self.config.bos_token_id == self.config.pad_token_id)
or (self.config.eos_token_id is not None and self.config.eos_token_id == self.config.pad_token_id)
or (self.config.sep_token_id is not None and self.config.sep_token_id == self.config.pad_token_id)
or (sep_token_id is not None and sep_token_id == self.config.pad_token_id)
):
warn_string += (
f"\nYou may ignore this warning if your `pad_token_id` ({self.config.pad_token_id}) is identical "
f"to the `bos_token_id` ({self.config.bos_token_id}), `eos_token_id` ({self.config.eos_token_id}), "
f"or the `sep_token_id` ({self.config.sep_token_id}), and your input is not padded."
f"or the `sep_token_id` ({sep_token_id}), and your input is not padded."
)

logger.warning_once(warn_string)
Expand Down
18 changes: 14 additions & 4 deletions src/transformers/models/afmoe/configuration_afmoe.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ class AfmoeConfig(PreTrainedConfig):
mup_enabled (`bool`, *optional*, defaults to `False`):
Whether to enable muP (Maximal Update Parametrization) input scaling. When enabled, input embeddings
are scaled by `sqrt(hidden_size)`.
eos_token_id (`int`, *optional*):
End of stream token id.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*):
Beginning of stream token id.

Example:
```python
Expand Down Expand Up @@ -155,6 +161,9 @@ def __init__(
layer_types: list | None = None,
attention_dropout: float | None = 0.0,
mup_enabled: bool | None = False,
eos_token_id: bool | None = None,
pad_token_id: bool | None = None,
bos_token_id: bool | None = None,
**kwargs,
):
self.vocab_size = vocab_size
Expand Down Expand Up @@ -197,11 +206,12 @@ def __init__(
num_key_value_heads = num_attention_heads

self.num_key_value_heads = num_key_value_heads
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.tie_word_embeddings = tie_word_embeddings

super().__init__(
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
super().__init__(**kwargs)


__all__ = ["AfmoeConfig"]
9 changes: 2 additions & 7 deletions src/transformers/models/aimv2/configuration_aimv2.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,6 @@ class Aimv2TextConfig(PreTrainedConfig):
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
pad_token_id (`int`, *optional*, defaults to 1):
The id of the padding token in the vocabulary.
bos_token_id (`int`, *optional*, defaults to 49406):
The id of the beginning-of-sequence token in the vocabulary.
eos_token_id (`int`, *optional*, defaults to 49407):
The id of the end-of-sequence token in the vocabulary.
max_position_embeddings (`int`, *optional*, defaults to 77):
Expand All @@ -185,14 +181,13 @@ def __init__(
qkv_bias: bool = False,
mlp_bias: bool = False,
hidden_act: str = "silu",
pad_token_id: int | None = None,
bos_token_id: int | None = None,
eos_token_id: int = 49407,
max_position_embeddings: int = 77,
initializer_range: bool = 0.02,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
super().__init__(**kwargs)
self.eos_token_id = eos_token_id

self.vocab_size = vocab_size
self.hidden_size = hidden_size
Expand Down
8 changes: 0 additions & 8 deletions src/transformers/models/aimv2/modular_aimv2.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,10 +171,6 @@ class Aimv2TextConfig(SiglipTextConfig):
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
pad_token_id (`int`, *optional*, defaults to 1):
The id of the padding token in the vocabulary.
bos_token_id (`int`, *optional*, defaults to 49406):
The id of the beginning-of-sequence token in the vocabulary.
eos_token_id (`int`, *optional*, defaults to 49407):
The id of the end-of-sequence token in the vocabulary.
max_position_embeddings (`int`, *optional*, defaults to 77):
Expand All @@ -196,8 +192,6 @@ def __init__(
qkv_bias: bool = False,
mlp_bias: bool = False,
hidden_act: str = "silu",
pad_token_id: int | None = None,
bos_token_id: int | None = None,
eos_token_id: int = 49407,
max_position_embeddings: int = 77,
initializer_range: bool = 0.02,
Expand All @@ -211,8 +205,6 @@ def __init__(
num_attention_heads=num_attention_heads,
hidden_act=hidden_act,
max_position_embeddings=max_position_embeddings,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
Expand Down
9 changes: 8 additions & 1 deletion src/transformers/models/albert/configuration_albert.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ class AlbertConfig(PreTrainedConfig):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 3):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether the model's input and output word embeddings should be tied.

Examples:

Expand Down Expand Up @@ -115,9 +117,14 @@ def __init__(
pad_token_id=0,
bos_token_id=2,
eos_token_id=3,
tie_word_embeddings=True,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
super().__init__(**kwargs)
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.tie_word_embeddings = tie_word_embeddings

self.vocab_size = vocab_size
self.embedding_size = embedding_size
Expand Down
13 changes: 8 additions & 5 deletions src/transformers/models/align/configuration_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,10 @@ class AlignTextConfig(PreTrainedConfig):
The epsilon used by the layer normalization layers.
pad_token_id (`int`, *optional*, defaults to 0):
Padding token id.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
bos_token_id (`int`, *optional*):
Beginning of stream token id.
eos_token_id (`int`, *optional*):
End of stream token id.

Example:

Expand Down Expand Up @@ -98,7 +99,8 @@ def __init__(
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
use_cache=True,
bos_token_id=None,
eos_token_id=None,
**kwargs,
):
super().__init__(**kwargs)
Expand All @@ -115,8 +117,9 @@ def __init__(
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id


class AlignVisionConfig(PreTrainedConfig):
Expand Down
Loading