Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 44 additions & 123 deletions examples/modular-transformers/configuration_duplicated_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,79 +5,18 @@
# modular_duplicated_method.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters
from ...utils import auto_docstring
from ...utils.type_validators import interval


@auto_docstring(checkpoint="meta-duplicated_method/DuplicatedMethod-2-7b-hf")
@strict(accept_kwargs=True)
class DuplicatedMethodConfig(PreTrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DuplicatedMethodModel`]. It is used to instantiate an DuplicatedMethod
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DuplicatedMethod-7B.
e.g. [meta-duplicated_method/DuplicatedMethod-2-7b-hf](https://huggingface.co/meta-duplicated_method/DuplicatedMethod-2-7b-hf)

Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information.


Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the DuplicatedMethod model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`DuplicatedMethodModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. DuplicatedMethod 1 supports up to 2048 tokens,
DuplicatedMethod 2 up to 4096, CodeLlama up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_parameters (`RopeParameters`, *optional*):
Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
with longer `max_position_embeddings`.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
head_dim (`int`, *optional*):
The attention head dimension. If None, it will default to hidden_size // num_attention_heads

```python
>>> from transformers import DuplicatedMethodModel, DuplicatedMethodConfig

Expand Down Expand Up @@ -109,64 +48,46 @@ class DuplicatedMethodConfig(PreTrainedConfig):
"norm": (["hidden_states"], ["hidden_states"]),
}

def __init__(
    self,
    vocab_size: int | None = 32000,
    hidden_size: int | None = 4096,
    intermediate_size: int | None = 11008,
    num_hidden_layers: int | None = 32,
    num_attention_heads: int | None = 32,
    num_key_value_heads: int | None = None,
    hidden_act: str | None = "silu",
    max_position_embeddings: int | None = 2048,
    initializer_range: float | None = 0.02,
    # Fix: was annotated `int | None`, but the default (1e-6) and every use
    # of this epsilon are floating point.
    rms_norm_eps: float | None = 1e-6,
    use_cache: bool | None = True,
    pad_token_id: int | None = None,
    bos_token_id: int | None = 1,
    eos_token_id: int | None = 2,
    pretraining_tp: int | None = 1,
    tie_word_embeddings: bool | None = False,
    rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
    attention_bias: bool | None = False,
    attention_dropout: float | None = 0.0,
    mlp_bias: bool | None = False,
    head_dim: int | None = None,
    **kwargs,
):
    """Build a DuplicatedMethod configuration.

    Parameter semantics are documented in the class docstring. Special
    token ids and `tie_word_embeddings` are forwarded to the
    `PreTrainedConfig` base initializer together with any extra `kwargs`.
    """
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads

    # Backward compatibility: older checkpoints omit `num_key_value_heads`,
    # which means plain multi-head attention (one KV head per query head).
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.pretraining_tp = pretraining_tp
    self.use_cache = use_cache
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    self.mlp_bias = mlp_bias
    # Derive the per-head dimension when it is not given explicitly.
    self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
    self.rope_parameters = rope_parameters

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
# Model architecture hyperparameters (dataclass-style fields validated by
# the `@strict` decorator on the class).
vocab_size: int = 32000
hidden_size: int = 4096
intermediate_size: int = 11008
num_hidden_layers: int = 32
num_attention_heads: int = 32
# None means "same as num_attention_heads" (plain MHA); resolved in __post_init__.
num_key_value_heads: int | None = None
hidden_act: str = "silu"
max_position_embeddings: int = 2048
# `interval` attaches a range validator; values outside [0.0, 1.0] are rejected.
initializer_range: float = interval(min=0.0, max=1.0)(default=0.02)
rms_norm_eps: float = 1e-6
use_cache: bool = True
# Special token ids; None disables the corresponding token.
pad_token_id: int | None = None
bos_token_id: int | None = 1
eos_token_id: int | list[int] | None = 2
pretraining_tp: int | None = 1
tie_word_embeddings: bool = False
rope_parameters: RopeParameters | dict | None = None
attention_bias: bool = False
# NOTE(review): annotated `int | float | None` though only a float ratio is
# meaningful — presumably kept wide for serialized configs; confirm.
attention_dropout: int | float | None = 0.0
mlp_bias: bool = False
# None means "derive hidden_size // num_attention_heads"; resolved in __post_init__.
head_dim: int | None = None

def __post_init__(self, **kwargs):
    """Resolve derived defaults, then delegate to the base post-init."""
    # Backward compatibility: an unspecified KV-head count means plain MHA.
    if self.num_key_value_heads is None:
        self.num_key_value_heads = self.num_attention_heads
    # Per-head dimension defaults to an even split of the hidden size.
    if self.head_dim is None:
        self.head_dim = self.hidden_size // self.num_attention_heads
    super().__post_init__(**kwargs)

def validate_architecture(self):
    """Part of `@strict`-powered validation. Validates the architecture of the config."""
    hidden, heads = self.hidden_size, self.num_attention_heads
    # Attention requires the hidden size to split evenly across heads.
    if hidden % heads:
        raise ValueError(
            f"The hidden size ({hidden}) is not a multiple of the number of attention "
            f"heads ({heads})."
        )

@property
def vocab_size(self):
def vocab_size(self): # noqa: F811 -> we need this at we cannot delete the original for now since config dataclass refactor
return 45

@vocab_size.setter
Expand Down
102 changes: 45 additions & 57 deletions examples/modular-transformers/configuration_my_new_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,16 @@
# modular_my_new_model.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters
from ...utils import auto_docstring
from ...utils.type_validators import interval


@auto_docstring(checkpoint="meta-my_new_model/MyNewModel-2-7b-hf")
@strict(accept_kwargs=True)
class MyNewModelConfig(PreTrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MyNewModelModel`]. It is used to instantiate an MyNewModel
Expand Down Expand Up @@ -145,60 +151,42 @@ class MyNewModelConfig(PreTrainedConfig):
"norm": (["hidden_states"], ["hidden_states"]),
}

def __init__(
    self,
    vocab_size: int | None = 32000,
    hidden_size: int | None = 4096,
    intermediate_size: int | None = 11008,
    num_hidden_layers: int | None = 32,
    num_attention_heads: int | None = 32,
    num_key_value_heads: int | None = None,
    hidden_act: str | None = "silu",
    max_position_embeddings: int | None = 2048,
    initializer_range: float | None = 0.02,
    # Fix: was annotated `int | None`, but the default (1e-6) and every use
    # of this epsilon are floating point.
    rms_norm_eps: float | None = 1e-6,
    use_cache: bool | None = True,
    pad_token_id: int | None = None,
    bos_token_id: int | None = 1,
    eos_token_id: int | None = 2,
    pretraining_tp: int | None = 1,
    tie_word_embeddings: bool | None = False,
    rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
    attention_bias: bool | None = False,
    attention_dropout: float | None = 0.0,
    # Annotations added for consistency with the other parameters; the
    # defaults are unchanged.
    mlp_bias: bool | None = True,
    head_dim: int | None = None,
    new_param: int = 0,
    **kwargs,
):
    """Build a MyNewModel configuration.

    Parameter semantics are documented in the class docstring. Special
    token ids and `tie_word_embeddings` are forwarded to the
    `PreTrainedConfig` base initializer together with any extra `kwargs`.
    """
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads

    # Backward compatibility: older checkpoints omit `num_key_value_heads`,
    # which means plain multi-head attention (one KV head per query head).
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.pretraining_tp = pretraining_tp
    self.use_cache = use_cache
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    self.mlp_bias = mlp_bias
    # Derive the per-head dimension when it is not given explicitly.
    self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
    self.rope_parameters = rope_parameters

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
    # Assigned after the base __init__, preserving the original order.
    self.new_param = new_param
# Model architecture hyperparameters (dataclass-style fields validated by
# the `@strict` decorator on the class).
vocab_size: int = 32000
hidden_size: int = 4096
intermediate_size: int = 11008
num_hidden_layers: int = 32
num_attention_heads: int = 32
# None means "same as num_attention_heads" (plain MHA); resolved in __post_init__.
num_key_value_heads: int | None = None
hidden_act: str = "silu"
max_position_embeddings: int = 2048
# `interval` attaches a range validator; values outside [0.0, 1.0] are rejected.
initializer_range: float = interval(min=0.0, max=1.0)(default=0.02)
rms_norm_eps: float = 1e-6
use_cache: bool = True
# Special token ids; None disables the corresponding token.
pad_token_id: int | None = None
bos_token_id: int | None = 1
eos_token_id: int | list[int] | None = 2
pretraining_tp: int | None = 1
tie_word_embeddings: bool = False
rope_parameters: RopeParameters | dict | None = None
attention_bias: bool = False
# NOTE(review): annotated `int | float | None` though only a float ratio is
# meaningful — presumably kept wide for serialized configs; confirm.
attention_dropout: int | float | None = 0.0

# Fields below differ from the shared base configuration.
mlp_bias: bool = True
# None means "derive hidden_size // num_attention_heads"; resolved in __post_init__.
head_dim: int | None = None
new_param: int = 0

def __post_init__(self, **kwargs):
    """Resolve derived defaults, then delegate to the base post-init."""
    # Backward compatibility: an unspecified KV-head count means plain MHA.
    if self.num_key_value_heads is None:
        self.num_key_value_heads = self.num_attention_heads
    # Per-head dimension defaults to an even split of the hidden size.
    if self.head_dim is None:
        self.head_dim = self.hidden_size // self.num_attention_heads
    super().__post_init__(**kwargs)

def validate_architecture(self):
    """Part of `@strict`-powered validation. Validates the architecture of the config."""
    hidden, heads = self.hidden_size, self.num_attention_heads
    # Attention requires the hidden size to split evenly across heads.
    if hidden % heads:
        raise ValueError(
            f"The hidden size ({hidden}) is not a multiple of the number of attention "
            f"heads ({heads})."
        )
Loading
Loading