diff --git a/examples/modular-transformers/configuration_duplicated_method.py b/examples/modular-transformers/configuration_duplicated_method.py index 4f7d98c6d8d8..9b4e8d029266 100644 --- a/examples/modular-transformers/configuration_duplicated_method.py +++ b/examples/modular-transformers/configuration_duplicated_method.py @@ -5,79 +5,18 @@ # modular_duplicated_method.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters +from ...utils import auto_docstring +from ...utils.type_validators import interval +@auto_docstring(checkpoint="meta-duplicated_method/DuplicatedMethod-2-7b-hf") +@strict(accept_kwargs=True) class DuplicatedMethodConfig(PreTrainedConfig): r""" - This is the configuration class to store the configuration of a [`DuplicatedMethodModel`]. It is used to instantiate an DuplicatedMethod - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the DuplicatedMethod-7B. - e.g. [meta-duplicated_method/DuplicatedMethod-2-7b-hf](https://huggingface.co/meta-duplicated_method/DuplicatedMethod-2-7b-hf) - - Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PreTrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the DuplicatedMethod model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`DuplicatedMethodModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. 
- num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details, check out [this - paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to - `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. DuplicatedMethod 1 supports up to 2048 tokens, - DuplicatedMethod 2 up to 4096, CodeLlama up to 16384. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - Padding token id. - bos_token_id (`int`, *optional*, defaults to 1): - Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 2): - End of stream token id. 
- pretraining_tp (`int`, *optional*, defaults to 1): - Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to - understand more about it. This value is necessary to ensure exact reproducibility of the pretraining - results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_parameters (`RopeParameters`, *optional*): - Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain - a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE - with longer `max_position_embeddings`. - attention_bias (`bool`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value and output projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - mlp_bias (`bool`, *optional*, defaults to `False`): - Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. - head_dim (`int`, *optional*): - The attention head dimension. 
If None, it will default to hidden_size // num_attention_heads - ```python >>> from transformers import DuplicatedMethodModel, DuplicatedMethodConfig @@ -109,64 +48,46 @@ class DuplicatedMethodConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - pretraining_tp: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - head_dim: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters - - 
super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = interval(min=0.0, max=1.0)(default=0.02) + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + pretraining_tp: int | None = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + mlp_bias: bool = False + head_dim: int | None = None + + def __post_init__(self, **kwargs): + if self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." 
+ ) @property - def vocab_size(self): + def vocab_size(self): # noqa: F811 -> we need this at we cannot delete the original for now since config dataclass refactor return 45 @vocab_size.setter diff --git a/examples/modular-transformers/configuration_my_new_model.py b/examples/modular-transformers/configuration_my_new_model.py index 2df6a66f1864..a4c369fca146 100644 --- a/examples/modular-transformers/configuration_my_new_model.py +++ b/examples/modular-transformers/configuration_my_new_model.py @@ -5,10 +5,16 @@ # modular_my_new_model.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters +from ...utils import auto_docstring +from ...utils.type_validators import interval +@auto_docstring(checkpoint="meta-my_new_model/MyNewModel-2-7b-hf") +@strict(accept_kwargs=True) class MyNewModelConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`MyNewModelModel`]. 
It is used to instantiate an MyNewModel @@ -145,60 +151,42 @@ class MyNewModelConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - pretraining_tp: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias=True, - head_dim: int | None = None, - new_param=0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - 
tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - self.new_param = new_param + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + max_position_embeddings: int = 2048 + initializer_range: float = interval(min=0.0, max=1.0)(default=0.02) + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + pretraining_tp: int | None = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + + mlp_bias: bool = True + head_dim: int | None = None + new_param: int = 0 + + def __post_init__(self, **kwargs): + if self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + super().__post_init__(**kwargs) + + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." + ) diff --git a/examples/modular-transformers/configuration_my_new_model2.py b/examples/modular-transformers/configuration_my_new_model2.py index 85875a92484d..eda7a1c1b7c2 100644 --- a/examples/modular-transformers/configuration_my_new_model2.py +++ b/examples/modular-transformers/configuration_my_new_model2.py @@ -4,11 +4,16 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_my_new_model2.py file directly. One of our CI enforces this. 
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig from ...modeling_rope_utils import RopeParameters +from ...utils import auto_docstring +from ...utils.type_validators import interval +@auto_docstring(checkpoint="meta-my_new_model2/MyNewModel2-2-7b-hf") +@strict(accept_kwargs=True) class MyNewModel2Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate an Gemma @@ -49,58 +54,40 @@ class MyNewModel2Config(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size: int | None = 32000, - hidden_size: int | None = 4096, - intermediate_size: int | None = 11008, - num_hidden_layers: int | None = 32, - num_attention_heads: int | None = 32, - num_key_value_heads: int | None = None, - hidden_act: str | None = "silu", - max_position_embeddings: int | None = 2048, - initializer_range: float | None = 0.02, - rms_norm_eps: int | None = 1e-6, - use_cache: bool | None = True, - pad_token_id: int | None = None, - bos_token_id: int | None = 1, - eos_token_id: int | None = 2, - pretraining_tp: int | None = 1, - tie_word_embeddings: bool | None = False, - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None, - attention_bias: bool | None = False, - attention_dropout: float | None = 0.0, - mlp_bias: bool | None = False, - head_dim: int | None = None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads + vocab_size: int = 32000 + hidden_size: int = 4096 + intermediate_size: int = 11008 + num_hidden_layers: int = 32 + num_attention_heads: int = 32 + num_key_value_heads: int | None = None + hidden_act: str = "silu" + 
max_position_embeddings: int = 2048 + initializer_range: float = interval(min=0.0, max=1.0)(default=0.02) + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int | None = None + bos_token_id: int | None = 1 + eos_token_id: int | list[int] | None = 2 + pretraining_tp: int | None = 1 + tie_word_embeddings: bool = False + rope_parameters: RopeParameters | dict | None = None + attention_bias: bool = False + attention_dropout: int | float | None = 0.0 + mlp_bias: bool = False + head_dim: int | None = None - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads + def __post_init__(self, **kwargs): + if self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - self.rope_parameters = rope_parameters + super().__post_init__(**kwargs) - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + def validate_architecture(self): + """Part of `@strict`-powered validation. Validates the architecture of the config.""" + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})." 
+ ) diff --git a/examples/modular-transformers/configuration_new_model.py b/examples/modular-transformers/configuration_new_model.py index f966084a6595..5213682da45a 100644 --- a/examples/modular-transformers/configuration_new_model.py +++ b/examples/modular-transformers/configuration_new_model.py @@ -6,69 +6,18 @@ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # Example where we only want to overwrite the defaults of an init +from huggingface_hub.dataclasses import strict + from ...configuration_utils import PreTrainedConfig +from ...utils import auto_docstring +@auto_docstring(checkpoint="google/new_model-7b") +@strict(accept_kwargs=True) class NewModelConfig(PreTrainedConfig): r""" - This is the configuration class to store the configuration of a [`NewModelModel`]. It is used to instantiate an NewModel - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the NewModel-7B. - e.g. [google/new_model-7b](https://huggingface.co/google/new_model-7b) - Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PreTrainedConfig`] for more information. - - Args: - vocab_size (`int`, *optional*, defaults to 256000): - Vocabulary size of the NewModel model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`NewModelModel`] - hidden_size (`int`, *optional*, defaults to 3072): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 24576): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 28): - Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each attention layer in the Transformer decoder. 
- num_key_value_heads (`int`, *optional*, defaults to 16): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details, check out [this - paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to - `num_attention_heads`. - head_dim (`int`, *optional*, defaults to 256): - The attention head dimension. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): - The legacy activation function. It is overwritten by the `hidden_activation`. - max_position_embeddings (`int`, *optional*, defaults to 8192): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*, defaults to 0): - Padding token id. - eos_token_id (`int`, *optional*, defaults to 1): - End of stream token id. - bos_token_id (`int`, *optional*, defaults to 2): - Beginning of stream token id. - tie_word_embeddings (`bool`, *optional*, defaults to `True`): - Whether to tie weight embeddings - rope_parameters (`RopeParameters`, *optional*): - Dictionary containing the configuration parameters for the RoPE embeddings. 
The dictionary should contain - a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE - with longer `max_position_embeddings`. - attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value and output projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - use_bidirectional_attention (`bool`, *optional*): - If True, the model will attend to all text tokens instead of using a causal mask. + use_bidirectional_attention (`bool`, *optional*): + If True, the model will attend to all text tokens instead of using a causal mask. ```python >>> from transformers import NewModelModel, NewModelConfig @@ -97,55 +46,27 @@ class NewModelConfig(PreTrainedConfig): "norm": (["hidden_states"], ["hidden_states"]), } - def __init__( - self, - vocab_size=256030, - hidden_size=64, - intermediate_size=90, - num_hidden_layers=28, - num_attention_heads=16, - num_key_value_heads=16, - head_dim=256, - hidden_act="gelu_pytorch_tanh", - hidden_activation=None, - max_position_embeddings=1500, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_parameters=None, - attention_bias=False, - attention_dropout=0.0, - use_bidirectional_attention=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_bias = attention_bias - 
self.attention_dropout = attention_dropout - self.use_bidirectional_attention = use_bidirectional_attention - self.rope_parameters = rope_parameters - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + vocab_size: int = 256030 + hidden_size: int = 64 + intermediate_size: int = 90 + num_hidden_layers: int = 28 + num_attention_heads: int = 16 + num_key_value_heads: int = 16 + head_dim: int = 256 + hidden_act: str = "gelu_pytorch_tanh" + max_position_embeddings: int = 1500 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int = 0 + eos_token_id: int = 1 + bos_token_id: int = 2 + tie_word_embeddings: bool = True + rope_parameters: dict | None = None + attention_bias: bool = False + attention_dropout: float = 0.0 + use_bidirectional_attention: bool = False + hidden_activation: str | None = None @property def num_heads(self): diff --git a/examples/modular-transformers/modeling_dummy_bert.py b/examples/modular-transformers/modeling_dummy_bert.py index 35c12af1dfb9..3a8ec4d237f5 100644 --- a/examples/modular-transformers/modeling_dummy_bert.py +++ b/examples/modular-transformers/modeling_dummy_bert.py @@ -103,7 +103,6 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None: - attention_mask = attention_mask[:, :, :, : key.shape[-2]] attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1) @@ -145,7 +144,6 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.FloatTensor | None = None, past_key_values: Cache | None = None, - cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor]: input_shape = hidden_states.shape[:-1] @@ -163,12 +161,7 @@ def forward( current_past_key_values = past_key_values.self_attention_cache # save all 
key/value_layer to cache to be re-used for fast auto-regressive generation - key_layer, value_layer = current_past_key_values.update( - key_layer, - value_layer, - self.layer_idx, - {"cache_position": cache_position}, - ) + key_layer, value_layer = current_past_key_values.update(key_layer, value_layer, self.layer_idx) attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( self.config._attn_implementation, eager_attention_forward @@ -294,7 +287,6 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, past_key_values: Cache | None = None, - cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor]: attention_mask = attention_mask if not self.is_cross_attention else encoder_attention_mask @@ -303,7 +295,6 @@ def forward( encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, past_key_values=past_key_values, - cache_position=cache_position, **kwargs, ) attention_output = self.output(attention_output, hidden_states) @@ -366,14 +357,12 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, past_key_values: Cache | None = None, - cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor]: + ) -> torch.Tensor: self_attention_output, _ = self.attention( hidden_states, attention_mask, past_key_values=past_key_values, - cache_position=cache_position, **kwargs, ) attention_output = self_attention_output @@ -420,7 +409,6 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, past_key_values: Cache | None = None, use_cache: bool | None = None, - cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor] | BaseModelOutputWithPastAndCrossAttentions: for i, layer_module in enumerate(self.layer): @@ -430,7 +418,6 @@ def forward( 
encoder_hidden_states, # as a positional argument for gradient checkpointing encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, - cache_position=cache_position, **kwargs, ) @@ -572,6 +559,9 @@ def forward( cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -584,19 +574,7 @@ def forward( else DynamicCache(config=self.config) ) - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - if input_ids is not None: - device = input_ids.device - seq_length = input_ids.shape[1] - else: - device = inputs_embeds.device - seq_length = inputs_embeds.shape[1] - past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 - if cache_position is None: - cache_position = torch.arange(past_key_values_length, past_key_values_length + seq_length, device=device) embedding_output = self.embeddings( input_ids=input_ids, @@ -611,7 +589,6 @@ def forward( encoder_attention_mask=encoder_attention_mask, embedding_output=embedding_output, encoder_hidden_states=encoder_hidden_states, - cache_position=cache_position, past_key_values=past_key_values, ) @@ -622,7 +599,6 @@ def forward( encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, use_cache=use_cache, - cache_position=cache_position, position_ids=position_ids, **kwargs, ) @@ -641,28 +617,26 @@ def _create_attention_masks( encoder_attention_mask, embedding_output, encoder_hidden_states, - cache_position, past_key_values, ): if self.config.is_decoder: attention_mask = create_causal_mask( config=self.config, - 
input_embeds=embedding_output, + inputs_embeds=embedding_output, attention_mask=attention_mask, - cache_position=cache_position, past_key_values=past_key_values, ) else: attention_mask = create_bidirectional_mask( config=self.config, - input_embeds=embedding_output, + inputs_embeds=embedding_output, attention_mask=attention_mask, ) if encoder_attention_mask is not None: encoder_attention_mask = create_bidirectional_mask( config=self.config, - input_embeds=embedding_output, + inputs_embeds=embedding_output, attention_mask=encoder_attention_mask, encoder_hidden_states=encoder_hidden_states, ) diff --git a/examples/modular-transformers/modeling_from_uppercase_model.py b/examples/modular-transformers/modeling_from_uppercase_model.py index af4ea303306f..31b818cf3e80 100644 --- a/examples/modular-transformers/modeling_from_uppercase_model.py +++ b/examples/modular-transformers/modeling_from_uppercase_model.py @@ -48,11 +48,6 @@ def __init__(self, config: FromUppercaseModelVisionConfig | FromUppercaseModelTe self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." 
- ) self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout self.is_causal = False diff --git a/examples/modular-transformers/modeling_global_indexing.py b/examples/modular-transformers/modeling_global_indexing.py index 3a4bdcbb6add..75a72a16c4dd 100644 --- a/examples/modular-transformers/modeling_global_indexing.py +++ b/examples/modular-transformers/modeling_global_indexing.py @@ -78,8 +78,7 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask + attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) @@ -132,7 +131,6 @@ def forward( position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, attention_mask: torch.Tensor | None = None, past_key_values: Cache | None = None, - cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor]: input_shape = hidden_states.shape[:-1] @@ -146,9 +144,7 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx) attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( self.config._attn_implementation, eager_attention_forward diff --git a/examples/modular-transformers/modeling_multimodal2.py 
b/examples/modular-transformers/modeling_multimodal2.py index 85cdcf3bd5d2..f15a440b7f03 100644 --- a/examples/modular-transformers/modeling_multimodal2.py +++ b/examples/modular-transformers/modeling_multimodal2.py @@ -10,8 +10,6 @@ import torch from torch import nn -from transformers.utils import add_start_docstrings - from ...activations import ACT2FN from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling @@ -53,11 +51,6 @@ def __init__(self, config: Multimodal2VisionConfig | Multimodal2TextConfig): self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." - ) self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout self.is_causal = False @@ -202,6 +195,28 @@ def forward( ) +@auto_docstring +class Multimodal2VisionPreTrainedModel(PreTrainedModel): + config: Multimodal2Config + base_model_prefix = "multimodal2_vision" + input_modalities = ("image", "text") + supports_gradient_checkpointing = True + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": Multimodal2VisionEncoderLayer, + "attentions": Multimodal2VisionAttention, + } + + @torch.no_grad() + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, Multimodal2VisionMLP): + pass + + class Multimodal2VisionEmbeddings(nn.Module): def __init__(self, config: Multimodal2VisionConfig): super().__init__() @@ -285,9 +300,14 @@ def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=Fals return embeddings -class Multimodal2VisionTransformer(nn.Module): +class 
Multimodal2VisionTransformer(Multimodal2VisionPreTrainedModel): + config: Multimodal2VisionConfig + main_input_name = "pixel_values" + input_modalities = ("image",) + _no_split_modules = ["CLIPEncoderLayer"] + def __init__(self, config): - super().__init__() + super().__init__(config) self.config = config embed_dim = config.hidden_size @@ -295,7 +315,10 @@ def __init__(self, config): self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.encoder = Multimodal2VisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.post_init() + @merge_with_config_defaults + @capture_outputs(tie_last_hidden_states=False) @auto_docstring def forward( self, @@ -324,32 +347,11 @@ def forward( ) -@auto_docstring -class Multimodal2VisionPreTrainedModel(PreTrainedModel): - config: Multimodal2Config - base_model_prefix = "multimodal2_vision" - input_modalities = ("image", "text") - supports_gradient_checkpointing = True - _supports_sdpa = True - _supports_flash_attn = True - _supports_flex_attn = True - _supports_attention_backend = True - _can_record_outputs = { - "hidden_states": Multimodal2VisionEncoderLayer, - "attentions": Multimodal2VisionAttention, - } - - @torch.no_grad() - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, Multimodal2VisionMLP): - pass - - -MULTIMODAL2_VISION_START_DOCSTRING = "doc" - - -@add_start_docstrings("New doc", MULTIMODAL2_VISION_START_DOCSTRING) +@auto_docstring( + custom_intro=""" + The vision model from MULTIMODAL2 without any head or projection on top. 
+ """ +) class Multimodal2VisionModel(Multimodal2VisionPreTrainedModel): config: Multimodal2VisionConfig main_input_name = "pixel_values" @@ -365,8 +367,6 @@ def __init__(self, config: Multimodal2VisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding - @merge_with_config_defaults - @capture_outputs(tie_last_hidden_states=False) @auto_docstring def forward( self, diff --git a/examples/modular-transformers/modeling_my_new_model2.py b/examples/modular-transformers/modeling_my_new_model2.py index 0fc26b94fdb7..2b6af9937d2d 100644 --- a/examples/modular-transformers/modeling_my_new_model2.py +++ b/examples/modular-transformers/modeling_my_new_model2.py @@ -20,6 +20,20 @@ from .configuration_my_new_model2 import MyNewModel2Config +class MyNewModel2TextScaledWordEmbedding(nn.Embedding): + """ + This module overrides nn.Embeddings' forward by multiplying with embeddings scale. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0): + super().__init__(num_embeddings, embedding_dim, padding_idx) + self.scalar_embed_scale = embed_scale + self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False) + + def forward(self, input_ids: torch.Tensor): + return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype) + + class MyNewModel2RMSNorm(nn.Module): def __init__(self, dim: int, eps: float = 1e-6): super().__init__() @@ -116,8 +130,7 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask + attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) @@ -160,7 +173,6 @@ def forward( 
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, attention_mask: torch.Tensor | None = None, past_key_values: Cache | None = None, - cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor]: input_shape = hidden_states.shape[:-1] @@ -174,9 +186,7 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx) attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( self.config._attn_implementation, eager_attention_forward @@ -216,7 +226,6 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, use_cache: bool | None = False, - cache_position: torch.LongTensor | None = None, position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: @@ -229,7 +238,6 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, - cache_position=cache_position, position_embeddings=position_embeddings, **kwargs, ) @@ -267,6 +275,8 @@ def _init_weights(self, module): # We initialize with 0s to be 1 centered as the RMSNorm here does (1 + weight) if "RMSNorm" in module.__class__.__name__: init.zeros_(module.weight) + elif isinstance(module, MyNewModel2TextScaledWordEmbedding): + init.constant_(module.embed_scale, module.scalar_embed_scale) class MyNewModel2ForSequenceClassification(GenericForSequenceClassification, MyNewModel2PreTrainedModel): diff --git a/examples/modular-transformers/modeling_new_task_model.py 
b/examples/modular-transformers/modeling_new_task_model.py index 82dffeebc411..8a080cce6a6a 100644 --- a/examples/modular-transformers/modeling_new_task_model.py +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -20,6 +20,7 @@ from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_compilable_check +from ...utils.deprecation import deprecate_kwarg from ..auto import AutoModel from .configuration_new_task_model import NewTaskModelConfig @@ -139,11 +140,11 @@ def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: return inner_mask +@deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds") def create_causal_mask_mapping( config: PreTrainedConfig, - input_embeds: torch.Tensor, + inputs_embeds: torch.Tensor, attention_mask: torch.Tensor | None, - cache_position: torch.Tensor, past_key_values: Cache | None, position_ids: torch.Tensor | None, token_type_ids: torch.Tensor | None = None, @@ -163,9 +164,8 @@ def create_causal_mask_mapping( mask_kwargs = { "config": config.get_text_config(), - "input_embeds": input_embeds, + "inputs_embeds": inputs_embeds, "attention_mask": attention_mask, - "cache_position": cache_position, "past_key_values": past_key_values, "position_ids": position_ids, } @@ -190,7 +190,7 @@ def create_causal_mask_mapping( "passing `token_type_ids` to the model to prevent bad attention masking." ) # NOTE: this branch can't be reached when training because `token_type_ids` is required as a model input. - token_type_ids = torch.ones_like(input_embeds)[:, :, 0] + token_type_ids = torch.ones_like(inputs_embeds)[:, :, 0] # Logic originally copied from Gemma3. It holds up for NewTaskModel as well because NewTaskModel assumes up to one image # per prompt AND we reverse `token_type_ids` above. 
Gemma3 uses a bidirectional mask for images, tagged through @@ -201,13 +201,13 @@ def create_causal_mask_mapping( # First find where a new image block starts: 1 if image and previous not image # The images cannot attend to future images, but can attend to all prev images and to itself bidirectionally - is_image = (token_type_ids == 1).to(cache_position.device) + is_image = (token_type_ids == 1).to(inputs_embeds.device) is_previous_image = nn.functional.pad(is_image, (1, 0), value=0)[:, :-1] new_image_start = is_image & ~is_previous_image image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1 image_group_ids = torch.where(is_image, image_group_ids, torch.full_like(token_type_ids, -1)) mask_kwargs["or_mask_function"] = token_type_ids_mask_function( - token_type_ids.to(cache_position.device), image_group_ids + token_type_ids.to(inputs_embeds.device), image_group_ids ) return create_masks_for_generate(**mask_kwargs) @@ -219,7 +219,6 @@ def create_causal_mask_mapping( """ ) class NewTaskModelModel(NewTaskModelPreTrainedModel): - _checkpoint_conversion_mapping = {"language_model.model": "language_model"} # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch accepts_loss_kwargs = False @@ -248,10 +247,9 @@ def set_input_embeddings(self, value): def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] ) -> tuple | BaseModelOutputWithPooling: - image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs) + image_outputs = self.vision_tower(pixel_values, **kwargs) selected_image_feature = image_outputs.last_hidden_state image_features = self.multi_modal_projector(selected_image_feature) - image_features = image_features / (self.config.text_config.hidden_size**0.5) image_outputs.pooler_output = image_features return image_outputs @@ -290,13 +288,9 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, token_type_ids: 
torch.LongTensor | None = None, - cache_position: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple | NewTaskModelModelOutputWithPast: r""" @@ -332,12 +326,6 @@ def forward( if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - # Replace image id with PAD if the image token if OOV, to avoid index-errors if input_ids is not None and self.config.image_token_id >= self.vocab_size: special_image_mask = input_ids == self.config.image_token_id @@ -349,18 +337,14 @@ def forward( if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(llm_input_ids) - if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 - cache_position = torch.arange( - past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device - ) - if position_ids is None: - position_ids = cache_position.unsqueeze(0) + 1 # NewTaskModel positions are 1-indexed + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens + position_ids = position_ids.unsqueeze(0) + 1 # NewTaskModel positions are 1-indexed # Merge text and images if pixel_values is not None: - image_features = self.get_image_features(pixel_values, 
return_dict=True).pooler_output + image_features = self.get_image_features(pixel_values).pooler_output image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) special_image_mask = self.get_placeholder_mask( input_ids, inputs_embeds=inputs_embeds, image_features=image_features @@ -373,7 +357,6 @@ def forward( self.config, inputs_embeds, attention_mask, - cache_position, past_key_values, position_ids, token_type_ids, @@ -387,10 +370,6 @@ def forward( past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - cache_position=cache_position, **kwargs, ) @@ -409,12 +388,6 @@ def forward( """ ) class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin): - _checkpoint_conversion_mapping = { - "^language_model.model": "model.language_model", - "^vision_tower": "model.vision_tower", - "^multi_modal_projector": "model.multi_modal_projector", - "^language_model.lm_head": "lm_head", - } _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"} main_input_name: ClassVar[str] = "doc_input_ids" # transformers-related @@ -491,7 +464,6 @@ def prepare_inputs_for_generation( input_ids, past_key_values=None, inputs_embeds=None, - cache_position=None, position_ids=None, pixel_values=None, attention_mask=None, @@ -509,7 +481,6 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, attention_mask=attention_mask, position_ids=position_ids, - cache_position=cache_position, use_cache=use_cache, logits_to_keep=logits_to_keep, token_type_ids=token_type_ids, @@ -519,10 +490,11 @@ def prepare_inputs_for_generation( # position_ids in NewTaskModel are 1-indexed if model_inputs.get("position_ids") is not None: - model_inputs["position_ids"] += 1 + # NOTE: we need this op out-of-place, otherwise it modifies the `model_kwargs` dict used in `generate` in-place! 
+ model_inputs["position_ids"] = model_inputs["position_ids"] + 1 # Pixel values are used only in the first iteration if available - # In subsquent iterations, they are already merged with text and cached + # In subsequent iterations, they are already merged with text and cached # NOTE: first iteration doesn't have to be prefill, it can be the first # iteration with a question and cached system prompt (continue generate from cache). NOTE: use_cache=False needs pixel_values always if is_first_iteration or not use_cache: @@ -531,11 +503,11 @@ def prepare_inputs_for_generation( return model_inputs @staticmethod + @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds") def create_masks_for_generate( config: PreTrainedConfig, - input_embeds: torch.Tensor, + inputs_embeds: torch.Tensor, attention_mask: torch.Tensor | None, - cache_position: torch.Tensor, past_key_values: Cache | None, position_ids: torch.Tensor | None, token_type_ids: torch.Tensor | None = None, @@ -545,9 +517,8 @@ def create_masks_for_generate( # Uses the overwritten `create_masks_for_generate` with `token_type_ids` masking return create_causal_mask_mapping( config, - input_embeds, + inputs_embeds, attention_mask, - cache_position, past_key_values, position_ids, token_type_ids, diff --git a/examples/modular-transformers/modeling_roberta.py b/examples/modular-transformers/modeling_roberta.py index 544b1c58bdc6..7ae436e70351 100644 --- a/examples/modular-transformers/modeling_roberta.py +++ b/examples/modular-transformers/modeling_roberta.py @@ -106,7 +106,6 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None: - attention_mask = attention_mask[:, :, :, : key.shape[-2]] attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1) @@ -148,7 +147,6 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.FloatTensor | None = None, past_key_values: Cache | None = 
None, - cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor]: input_shape = hidden_states.shape[:-1] @@ -166,12 +164,7 @@ def forward( current_past_key_values = past_key_values.self_attention_cache # save all key/value_layer to cache to be re-used for fast auto-regressive generation - key_layer, value_layer = current_past_key_values.update( - key_layer, - value_layer, - self.layer_idx, - {"cache_position": cache_position}, - ) + key_layer, value_layer = current_past_key_values.update(key_layer, value_layer, self.layer_idx) attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( self.config._attn_implementation, eager_attention_forward @@ -297,7 +290,6 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, past_key_values: Cache | None = None, - cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor]: attention_mask = attention_mask if not self.is_cross_attention else encoder_attention_mask @@ -306,7 +298,6 @@ def forward( encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, past_key_values=past_key_values, - cache_position=cache_position, **kwargs, ) attention_output = self.output(attention_output, hidden_states) @@ -369,14 +360,12 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, past_key_values: Cache | None = None, - cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor]: + ) -> torch.Tensor: self_attention_output, _ = self.attention( hidden_states, attention_mask, past_key_values=past_key_values, - cache_position=cache_position, **kwargs, ) attention_output = self_attention_output @@ -423,7 +412,6 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, past_key_values: Cache | None = None, use_cache: 
bool | None = None, - cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor] | BaseModelOutputWithPastAndCrossAttentions: for i, layer_module in enumerate(self.layer): @@ -433,7 +421,6 @@ def forward( encoder_hidden_states, # as a positional argument for gradient checkpointing encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, - cache_position=cache_position, **kwargs, ) @@ -569,9 +556,11 @@ def forward( encoder_attention_mask: torch.Tensor | None = None, past_key_values: Cache | None = None, use_cache: bool | None = None, - cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -584,19 +573,7 @@ def forward( else DynamicCache(config=self.config) ) - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - if input_ids is not None: - device = input_ids.device - seq_length = input_ids.shape[1] - else: - device = inputs_embeds.device - seq_length = inputs_embeds.shape[1] - past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 - if cache_position is None: - cache_position = torch.arange(past_key_values_length, past_key_values_length + seq_length, device=device) embedding_output = self.embeddings( input_ids=input_ids, @@ -611,7 +588,6 @@ def forward( encoder_attention_mask=encoder_attention_mask, embedding_output=embedding_output, encoder_hidden_states=encoder_hidden_states, - cache_position=cache_position, past_key_values=past_key_values, ) @@ -622,7 +598,6 @@ def forward( encoder_attention_mask=encoder_attention_mask, 
past_key_values=past_key_values, use_cache=use_cache, - cache_position=cache_position, position_ids=position_ids, **kwargs, ) @@ -641,28 +616,26 @@ def _create_attention_masks( encoder_attention_mask, embedding_output, encoder_hidden_states, - cache_position, past_key_values, ): if self.config.is_decoder: attention_mask = create_causal_mask( config=self.config, - input_embeds=embedding_output, + inputs_embeds=embedding_output, attention_mask=attention_mask, - cache_position=cache_position, past_key_values=past_key_values, ) else: attention_mask = create_bidirectional_mask( config=self.config, - input_embeds=embedding_output, + inputs_embeds=embedding_output, attention_mask=attention_mask, ) if encoder_attention_mask is not None: encoder_attention_mask = create_bidirectional_mask( config=self.config, - input_embeds=embedding_output, + inputs_embeds=embedding_output, attention_mask=encoder_attention_mask, encoder_hidden_states=encoder_hidden_states, ) diff --git a/examples/modular-transformers/modeling_super.py b/examples/modular-transformers/modeling_super.py index 86931b26baff..825da3a0e932 100644 --- a/examples/modular-transformers/modeling_super.py +++ b/examples/modular-transformers/modeling_super.py @@ -27,7 +27,7 @@ @use_kernel_forward_from_hub("RMSNorm") class SuperRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): + def __init__(self, hidden_size, eps: float = 1e-6) -> None: """ SuperRMSNorm is equivalent to T5LayerNorm """ @@ -35,7 +35,7 @@ def __init__(self, hidden_size, eps=1e-6): self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps - def forward(self, hidden_states): + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: input_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) variance = hidden_states.pow(2).mean(-1, keepdim=True) @@ -187,8 +187,7 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not 
None: - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask + attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) @@ -231,7 +230,6 @@ def forward( position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, attention_mask: torch.Tensor | None = None, past_key_values: Cache | None = None, - cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor]: input_shape = hidden_states.shape[:-1] @@ -245,9 +243,7 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx) attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( self.config._attn_implementation, eager_attention_forward @@ -287,7 +283,6 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, use_cache: bool | None = False, - cache_position: torch.LongTensor | None = None, position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: @@ -300,7 +295,6 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, - cache_position=cache_position, position_embeddings=position_embeddings, **kwargs, ) diff --git a/examples/modular-transformers/modeling_switch_function.py b/examples/modular-transformers/modeling_switch_function.py 
index 5a978fb1b059..8e5bbf65f94a 100644 --- a/examples/modular-transformers/modeling_switch_function.py +++ b/examples/modular-transformers/modeling_switch_function.py @@ -79,8 +79,7 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask + attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) @@ -123,7 +122,6 @@ def forward( position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, attention_mask: torch.Tensor | None = None, past_key_values: Cache | None = None, - cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor]: input_shape = hidden_states.shape[:-1] @@ -137,9 +135,7 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx) attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( self.config._attn_implementation, eager_attention_forward diff --git a/examples/modular-transformers/modeling_test_detr.py b/examples/modular-transformers/modeling_test_detr.py index 818788a9bb01..51b0d178fb8f 100644 --- a/examples/modular-transformers/modeling_test_detr.py +++ b/examples/modular-transformers/modeling_test_detr.py @@ -22,10 +22,10 @@ from ...modeling_outputs import BaseModelOutput, 
BaseModelOutputWithCrossAttentions from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...pytorch_utils import compile_compatible_method_lru_cache, meshgrid +from ...pytorch_utils import compile_compatible_method_lru_cache from ...utils import ModelOutput, TransformersKwargs, auto_docstring, torch_compilable_check -from ...utils.generic import can_return_tuple, check_model_inputs -from ...utils.output_capturing import OutputRecorder +from ...utils.generic import can_return_tuple, merge_with_config_defaults +from ...utils.output_capturing import OutputRecorder, capture_outputs from .configuration_test_detr import TestDetrConfig @@ -365,7 +365,6 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None: - attention_mask = attention_mask[:, :, :, : key.shape[-2]] attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1) @@ -630,7 +629,7 @@ def forward( hidden_states = self.final_layer_norm(hidden_states) if self.training: - if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + if not torch.isfinite(hidden_states).all(): clamp_value = torch.finfo(hidden_states.dtype).max - 1000 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) @@ -822,7 +821,8 @@ def __init__(self, config: TestDetrConfig): # Initialize weights and apply final processing self.post_init() - @check_model_inputs() + @merge_with_config_defaults + @capture_outputs def forward( self, inputs_embeds=None, @@ -889,7 +889,7 @@ def get_reference_points(spatial_shapes_list, valid_ratios, device): """ reference_points_list = [] for level, (height, width) in enumerate(spatial_shapes_list): - ref_y, ref_x = meshgrid( + ref_y, ref_x = torch.meshgrid( torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device), torch.linspace(0.5, width - 0.5, width, 
dtype=valid_ratios.dtype, device=device), indexing="ij", @@ -945,7 +945,8 @@ def __init__(self, config: TestDetrConfig): # Initialize weights and apply final processing self.post_init() - @check_model_inputs() + @merge_with_config_defaults + @capture_outputs def forward( self, inputs_embeds=None, @@ -1186,7 +1187,7 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) - grid_y, grid_x = meshgrid( + grid_y, grid_x = torch.meshgrid( torch.linspace( 0, height - 1, diff --git a/examples/modular-transformers/modeling_test_suffix.py b/examples/modular-transformers/modeling_test_suffix.py index 029e9d3a74b9..894687fdc2db 100644 --- a/examples/modular-transformers/modeling_test_suffix.py +++ b/examples/modular-transformers/modeling_test_suffix.py @@ -25,7 +25,7 @@ class TestSuffixDecoderLayer(nn.module): @use_kernel_forward_from_hub("RMSNorm") class TestSuffixLlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): + def __init__(self, hidden_size, eps: float = 1e-6) -> None: """ TestSuffixLlamaRMSNorm is equivalent to T5LayerNorm """ @@ -33,7 +33,7 @@ def __init__(self, hidden_size, eps=1e-6): self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps - def forward(self, hidden_states): + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: input_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) variance = hidden_states.pow(2).mean(-1, keepdim=True) @@ -120,8 +120,7 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask + attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = 
nn.functional.dropout(attn_weights, p=dropout, training=module.training) @@ -164,7 +163,6 @@ def forward( position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, attention_mask: torch.Tensor | None = None, past_key_values: Cache | None = None, - cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor]: input_shape = hidden_states.shape[:-1] @@ -178,9 +176,7 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx) attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( self.config._attn_implementation, eager_attention_forward @@ -220,7 +216,6 @@ def forward( position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, use_cache: bool | None = False, - cache_position: torch.LongTensor | None = None, position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: @@ -233,7 +228,6 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, - cache_position=cache_position, position_embeddings=position_embeddings, **kwargs, ) diff --git a/examples/modular-transformers/modular_duplicated_method.py b/examples/modular-transformers/modular_duplicated_method.py index 06d5e03437ef..0c79fe6bdaf7 100644 --- a/examples/modular-transformers/modular_duplicated_method.py +++ b/examples/modular-transformers/modular_duplicated_method.py @@ -3,7 +3,7 @@ class DuplicatedMethodConfig(LlamaConfig): @property - def vocab_size(self): + def 
vocab_size(self): # noqa: F811 -> we need this as we cannot delete the original for now since config dataclass refactor return 45 @vocab_size.setter diff --git a/examples/modular-transformers/modular_multimodal2.py b/examples/modular-transformers/modular_multimodal2.py index 8f5d564f799b..81751d0815da 100644 --- a/examples/modular-transformers/modular_multimodal2.py +++ b/examples/modular-transformers/modular_multimodal2.py @@ -20,7 +20,6 @@ class Multimodal2VisionModel(CLIPVisionModel): CLIPVisionModel, CLIPVisionTransformer, ) -from transformers.utils import add_start_docstrings class Multimodal2VisionAttention(CLIPAttention): @@ -44,14 +43,6 @@ def __init__(self, config): self.layers = nn.ModuleList([Multimodal2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) -# Finally here the `Vision` part was correct in CLIP, but we still need to tell it that the encoder and attn arg should -# use it as well -class Multimodal2VisionTransformer(CLIPVisionTransformer): - def __init__(self, config): - super().__init__(config) - self.encoder = Multimodal2VisionEncoder(config) - - class Multimodal2VisionPreTrainedModel(CLIPPreTrainedModel): _can_record_outputs = { "hidden_states": Multimodal2VisionEncoderLayer, @@ -63,13 +54,17 @@ def _init_weights(self, module): pass -MULTIMODAL2_VISION_START_DOCSTRING = "doc" +# Finally here the `Vision` part was correct in CLIP, but we still need to tell it that the encoder and attn arg should +# use it as well +class Multimodal2VisionTransformer(CLIPVisionTransformer, Multimodal2VisionPreTrainedModel): + _no_split_modules = ["CLIPEncoderLayer"] + + def __init__(self, config): + super().__init__(config) + self.encoder = Multimodal2VisionEncoder(config) # Here the only arg `self.vision_model = CLIPVisionTransformer(config)` in CLIPVisionModel already has the "Vision" part, so # no need to overwrite it, it will look for `Multimodal2VisionTransformer` which has already being redefined above -# Note: we may want to redefine 
decorator as well for full consistency, as CLIP does not use "CLIP_VISION_START_DOCSTRING" but only -# "CLIP_START_DOCSTRING" -@add_start_docstrings("New doc", MULTIMODAL2_VISION_START_DOCSTRING) class Multimodal2VisionModel(CLIPVisionModel, Multimodal2VisionPreTrainedModel): _no_split_modules = ["Multimodal2VisionEncoderLayer"] diff --git a/examples/modular-transformers/modular_my_new_model.py b/examples/modular-transformers/modular_my_new_model.py index d6ae897e34f9..539c3471ac28 100644 --- a/examples/modular-transformers/modular_my_new_model.py +++ b/examples/modular-transformers/modular_my_new_model.py @@ -120,7 +120,5 @@ class MyNewModelConfig(LlamaConfig): ``` """ - def __init__(self, mlp_bias=True, new_param=0, **super_kwargs): - super().__init__(self, **super_kwargs) - self.mlp_bias = mlp_bias - self.new_param = new_param + mlp_bias: bool = True + new_param: int = 0 diff --git a/examples/modular-transformers/modular_new_model.py b/examples/modular-transformers/modular_new_model.py index 698babb9d959..01eeb33e4f6c 100644 --- a/examples/modular-transformers/modular_new_model.py +++ b/examples/modular-transformers/modular_new_model.py @@ -4,32 +4,27 @@ class NewModelConfig(GemmaConfig): - def __init__( - self, - vocab_size=256030, - hidden_size=64, - intermediate_size=90, - num_hidden_layers=28, - num_attention_heads=16, - num_key_value_heads=16, - head_dim=256, - hidden_act="gelu_pytorch_tanh", - hidden_activation=None, - max_position_embeddings=1500, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_parameters=None, - attention_bias=False, - attention_dropout=0.0, - use_bidirectional_attention=False, - **kwargs, - ): - super().__init__(self, **kwargs) + vocab_size: int = 256030 + hidden_size: int = 64 + intermediate_size: int = 90 + num_hidden_layers: int = 28 + num_attention_heads: int = 16 + num_key_value_heads: int = 16 + head_dim: int = 256 + 
hidden_act: str = "gelu_pytorch_tanh" + hidden_activation: str | None = None + max_position_embeddings: int = 1500 + initializer_range: float = 0.02 + rms_norm_eps: float = 1e-6 + use_cache: bool = True + pad_token_id: int = 0 + eos_token_id: int = 1 + bos_token_id: int = 2 + tie_word_embeddings: bool = True + rope_parameters: dict | None = None + attention_bias: bool = False + attention_dropout: float = 0.0 + use_bidirectional_attention: bool = False @property def num_heads(self):