24 changes: 12 additions & 12 deletions src/transformers/models/gemma2/configuration_gemma2.py
@@ -36,15 +36,15 @@ class Gemma2Config(PretrainedConfig):
         vocab_size (`int`, *optional*, defaults to 256000):
             Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
             `inputs_ids` passed when calling [`Gemma2Model`]
-        hidden_size (`int`, *optional*, defaults to 3072):
+        hidden_size (`int`, *optional*, defaults to 2304):
             Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 24576):
+        intermediate_size (`int`, *optional*, defaults to 9216):
             Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 28):
+        num_hidden_layers (`int`, *optional*, defaults to 26):
             Number of hidden layers in the Transformer decoder.
-        num_attention_heads (`int`, *optional*, defaults to 16):
+        num_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer decoder.
-        num_key_value_heads (`int`, *optional*, defaults to 16):
+        num_key_value_heads (`int`, *optional*, defaults to 4):
             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
@@ -80,7 +80,7 @@ class Gemma2Config(PretrainedConfig):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        query_pre_attn_scalar (`float`, *optional*, defaults to 224): scaling factor used on the attention scores
+        query_pre_attn_scalar (`float`, *optional*, defaults to 256): scaling factor used on the attention scores
         sliding_window (`int`, *optional*, defaults to 4096): in Gemma2, every other layer uses sliding window attention. This is the
             size of the sliding window.
         final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits.
@@ -103,11 +103,11 @@ class Gemma2Config(PretrainedConfig):
     def __init__(
         self,
         vocab_size=256000,
-        hidden_size=3072,
-        intermediate_size=24576,
-        num_hidden_layers=28,
-        num_attention_heads=16,
-        num_key_value_heads=16,
+        hidden_size=2304,
+        intermediate_size=9216,
+        num_hidden_layers=26,
+        num_attention_heads=8,
+        num_key_value_heads=4,
         head_dim=256,
         hidden_activation="gelu_pytorch_tanh",
         max_position_embeddings=8192,
@@ -121,7 +121,7 @@ def __init__(
         rope_theta=10000.0,
         attention_bias=False,
         attention_dropout=0.0,
-        query_pre_attn_scalar=224,
+        query_pre_attn_scalar=256,
         sliding_window=4096,
         final_logit_softcapping=30.0,
         attn_logit_softcapping=50.0,
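The new defaults match the released Gemma 2 2B checkpoint. For orientation, here is a minimal sketch of what the updated defaults imply for grouped-query attention, assuming a transformers version with this PR applied; it is illustrative only and not part of the diff:

from transformers import Gemma2Config

# Default construction now yields the 2B-sized architecture.
config = Gemma2Config()

# Grouped Query Attention: 8 query heads share 4 key/value heads,
# so each KV head serves num_attention_heads // num_key_value_heads query heads.
assert config.num_attention_heads % config.num_key_value_heads == 0
group_size = config.num_attention_heads // config.num_key_value_heads
print(group_size)  # 2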
24 changes: 12 additions & 12 deletions src/transformers/models/gemma2/modular_gemma2.py
@@ -67,15 +67,15 @@ class Gemma2Config(PretrainedConfig):
         vocab_size (`int`, *optional*, defaults to 256000):
             Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
             `inputs_ids` passed when calling [`Gemma2Model`]
-        hidden_size (`int`, *optional*, defaults to 3072):
+        hidden_size (`int`, *optional*, defaults to 2304):
             Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 24576):
+        intermediate_size (`int`, *optional*, defaults to 9216):
             Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 28):
+        num_hidden_layers (`int`, *optional*, defaults to 26):
             Number of hidden layers in the Transformer decoder.
-        num_attention_heads (`int`, *optional*, defaults to 16):
+        num_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer decoder.
-        num_key_value_heads (`int`, *optional*, defaults to 16):
+        num_key_value_heads (`int`, *optional*, defaults to 4):
             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
@@ -111,7 +111,7 @@ class Gemma2Config(PretrainedConfig):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        query_pre_attn_scalar (`float`, *optional*, defaults to 224): scaling factor used on the attention scores
+        query_pre_attn_scalar (`float`, *optional*, defaults to 256): scaling factor used on the attention scores
         sliding_window (`int`, *optional*, defaults to 4096): in Gemma2, every other layer uses sliding window attention. This is the
             size of the sliding window.
         final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits.
@@ -134,11 +134,11 @@ class Gemma2Config(PretrainedConfig):
     def __init__(
         self,
         vocab_size=256000,
-        hidden_size=3072,
-        intermediate_size=24576,
-        num_hidden_layers=28,
-        num_attention_heads=16,
-        num_key_value_heads=16,
+        hidden_size=2304,
+        intermediate_size=9216,
+        num_hidden_layers=26,
+        num_attention_heads=8,
+        num_key_value_heads=4,
         head_dim=256,
         hidden_activation="gelu_pytorch_tanh",
         max_position_embeddings=8192,
@@ -152,7 +152,7 @@ def __init__(
         rope_theta=10000.0,
         attention_bias=False,
         attention_dropout=0.0,
-        query_pre_attn_scalar=224,
+        query_pre_attn_scalar=256,
         sliding_window=4096,
         final_logit_softcapping=30.0,
         attn_logit_softcapping=50.0,
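Two of the touched parameters deserve a note. Per the docstrings, `query_pre_attn_scalar` replaces the usual `head_dim` in the softmax scaling, and the two softcapping values bound attention and final logits with a tanh. Below is a minimal sketch of both behaviors, assuming the standard Gemma2 attention math; it mirrors what the docstrings describe and is not code from this PR:

import torch

def scale_queries(query_states: torch.Tensor, query_pre_attn_scalar: float = 256.0) -> torch.Tensor:
    # Queries are multiplied by query_pre_attn_scalar**-0.5 in place of the
    # usual head_dim**-0.5; with the new default of 256 (equal to head_dim
    # here) the two coincide.
    return query_states * query_pre_attn_scalar**-0.5

def softcap(logits: torch.Tensor, cap: float) -> torch.Tensor:
    # Tanh softcapping keeps logits in (-cap, cap): cap=50.0 for attention
    # logits (attn_logit_softcapping), cap=30.0 for the final LM logits
    # (final_logit_softcapping).
    return cap * torch.tanh(logits / cap)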