diff --git a/src/transformers/models/afmoe/configuration_afmoe.py b/src/transformers/models/afmoe/configuration_afmoe.py
index 76336c07f486..3239e26f49ad 100644
--- a/src/transformers/models/afmoe/configuration_afmoe.py
+++ b/src/transformers/models/afmoe/configuration_afmoe.py
@@ -87,7 +87,7 @@ class AfmoeConfig(PreTrainedConfig):
     output_router_logits: bool = False
     global_attn_every_n_layers: int | None = 4
     sliding_window: int | None = 1024
-    layer_types: list | None = None
+    layer_types: list[str] | None = None
     attention_dropout: float | int | None = 0.0
     mup_enabled: bool | None = False
     eos_token_id: int | list[int] | None = None
diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py
index 09547826685d..e7258ff0f33a 100644
--- a/src/transformers/models/llama4/configuration_llama4.py
+++ b/src/transformers/models/llama4/configuration_llama4.py
@@ -166,7 +166,7 @@ class Llama4TextConfig(PreTrainedConfig):
     no_rope_layers: list[int] | None = None
     no_rope_layer_interval: int = 4
     attention_chunk_size: int = 8192
-    layer_types: list[int] | None = None
+    layer_types: list[str] | None = None
     attn_temperature_tuning: bool = True
     floor_scale: int = 8192
     attn_scale: float = 0.1
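
For context, a minimal sketch (not part of the patch) of what the tightened annotation implies at the call site: `layer_types` holds per-layer attention-type names as strings, so `list[str]` is the accurate hint rather than a bare `list` or `list[int]`. The specific string values used below ("sliding_attention", "full_attention") and the keyword-argument construction are assumptions based on similar configs, not taken from this diff.

# Illustration only: not part of the patch. The accepted layer-type strings
# are assumed here; check the model's modeling code for the real names.
from transformers.models.afmoe.configuration_afmoe import AfmoeConfig

layer_types: list[str] = ["sliding_attention", "sliding_attention", "full_attention"]
config = AfmoeConfig(layer_types=layer_types)

# With list[str], a type checker now rejects integer entries such as:
# AfmoeConfig(layer_types=[0, 0, 1])  # flagged under the new annotation
print(config.layer_types)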