
Commit

Merge pull request #26 from danny-1k/devdev
Minor bug fix: stop passing the dropout probability to the norm layers as their epsilon
HMUNACHI committed Mar 15, 2024
2 parents bccf348 + 190b03d · commit 833aebe
Showing 5 changed files with 20 additions and 20 deletions.
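
Background on the fix: in Flax's linen API, epsilon is the first constructor argument of both nn.LayerNorm and nn.RMSNorm, so passing self.dropout positionally silently set each norm layer's epsilon to the dropout probability instead of configuring dropout. A minimal sketch of the difference, assuming a Flax version that provides nn.RMSNorm:

import flax.linen as nn

# epsilon is the first field of the linen norm layers, so a positional
# argument intended as a dropout rate is interpreted as epsilon.
buggy = nn.RMSNorm(0.1)   # epsilon == 0.1 (the dropout probability)
fixed = nn.RMSNorm()      # epsilon == 1e-6, the default

print(buggy.epsilon)  # 0.1
print(fixed.epsilon)  # 1e-06
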
6 changes: 3 additions & 3 deletions docs/examples/mistral_copy_example.ipynb
@@ -311,9 +311,9 @@
     "            shift_size=self.shift_size)\n",
     "        \n",
     "        self.feed_forward = PositionWiseFFN(self.feedforward_dim, self.hidden_dim)\n",
-    "        self.norm1 = nn.RMSNorm(self.dropout)\n",
-    "        self.norm2 = nn.RMSNorm(self.dropout)\n",
-    "        self.norm3 = nn.RMSNorm(self.dropout)\n",
+    "        self.norm1 = nn.RMSNorm()\n",
+    "        self.norm2 = nn.RMSNorm()\n",
+    "        self.norm3 = nn.RMSNorm()\n",
     "        self.dropout1 = nn.Dropout(self.dropout)\n",
     "        self.dropout2 = nn.Dropout(self.dropout)\n",
     "        self.dropout3 = nn.Dropout(self.dropout)\n",
4 changes: 2 additions & 2 deletions nanodl/__src/models/gemma.py
@@ -206,8 +206,8 @@ def setup(self):
                 num_heads=self.num_heads,
                 num_groups=self.num_groups)
         self.feed_forward = GemmaMLP(self.feedforward_dim, self.hidden_dim)
-        self.norm1 = nn.RMSNorm(self.dropout)
-        self.norm2 = nn.RMSNorm(self.dropout)
+        self.norm1 = nn.RMSNorm()
+        self.norm2 = nn.RMSNorm()
         self.dropout1 = nn.Dropout(self.dropout)
         self.dropout2 = nn.Dropout(self.dropout)

12 changes: 6 additions & 6 deletions nanodl/__src/models/gpt.py
@@ -141,9 +141,9 @@ def setup(self):
         self.attention1 = SelfMultiHeadAttention(hidden_dim=self.hidden_dim, num_heads=self.num_heads)
         self.attention2 = SelfMultiHeadAttention(hidden_dim=self.hidden_dim, num_heads=self.num_heads)
         self.feed_forward = PositionWiseFFN(self.feedforward_dim, self.hidden_dim)
-        self.norm1 = nn.LayerNorm(self.dropout)
-        self.norm2 = nn.LayerNorm(self.dropout)
-        self.norm3 = nn.LayerNorm(self.dropout)
+        self.norm1 = nn.LayerNorm()
+        self.norm2 = nn.LayerNorm()
+        self.norm3 = nn.LayerNorm()
         self.dropout1 = nn.Dropout(self.dropout)
         self.dropout2 = nn.Dropout(self.dropout)
         self.dropout3 = nn.Dropout(self.dropout)
@@ -554,9 +554,9 @@ def setup(self):
                 self.hidden_dim,
                 self.num_experts,
                 self.top_k)
-        self.norm1 = nn.LayerNorm(self.dropout)
-        self.norm2 = nn.LayerNorm(self.dropout)
-        self.norm3 = nn.LayerNorm(self.dropout)
+        self.norm1 = nn.LayerNorm()
+        self.norm2 = nn.LayerNorm()
+        self.norm3 = nn.LayerNorm()
         self.dropout1 = nn.Dropout(self.dropout)
         self.dropout2 = nn.Dropout(self.dropout)
         self.dropout3 = nn.Dropout(self.dropout)
6 changes: 3 additions & 3 deletions nanodl/__src/models/llama.py
@@ -208,9 +208,9 @@ def setup(self):
                 num_heads=self.num_heads,
                 num_groups=self.num_groups)
         self.feed_forward = PositionWiseFFN(self.feedforward_dim, self.hidden_dim)
-        self.norm1 = nn.RMSNorm(self.dropout)
-        self.norm2 = nn.RMSNorm(self.dropout)
-        self.norm3 = nn.RMSNorm(self.dropout)
+        self.norm1 = nn.RMSNorm()
+        self.norm2 = nn.RMSNorm()
+        self.norm3 = nn.RMSNorm()
         self.dropout1 = nn.Dropout(self.dropout)
         self.dropout2 = nn.Dropout(self.dropout)
         self.dropout3 = nn.Dropout(self.dropout)
12 changes: 6 additions & 6 deletions nanodl/__src/models/mistral.py
@@ -258,9 +258,9 @@ def setup(self):
                 shift_size=self.shift_size)

         self.feed_forward = PositionWiseFFN(self.feedforward_dim, self.hidden_dim)
-        self.norm1 = nn.RMSNorm(self.dropout)
-        self.norm2 = nn.RMSNorm(self.dropout)
-        self.norm3 = nn.RMSNorm(self.dropout)
+        self.norm1 = nn.RMSNorm()
+        self.norm2 = nn.RMSNorm()
+        self.norm3 = nn.RMSNorm()
         self.dropout1 = nn.Dropout(self.dropout)
         self.dropout2 = nn.Dropout(self.dropout)
         self.dropout3 = nn.Dropout(self.dropout)
@@ -686,9 +686,9 @@ def setup(self):
                 shift_size=self.shift_size)

         self.feed_forward = SparseMixtureOfExperts(self.feedforward_dim, self.hidden_dim)
-        self.norm1 = nn.RMSNorm(self.dropout)
-        self.norm2 = nn.RMSNorm(self.dropout)
-        self.norm3 = nn.RMSNorm(self.dropout)
+        self.norm1 = nn.RMSNorm()
+        self.norm2 = nn.RMSNorm()
+        self.norm3 = nn.RMSNorm()
         self.dropout1 = nn.Dropout(self.dropout)
         self.dropout2 = nn.Dropout(self.dropout)
         self.dropout3 = nn.Dropout(self.dropout)
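
For reference, a minimal sketch of the corrected pattern shared by all five files; the class name and fields below are illustrative, not taken from nanodl:

import flax.linen as nn

class Block(nn.Module):  # illustrative name, not from the repository
    hidden_dim: int
    dropout: float

    def setup(self):
        # The norm layer keeps its default epsilon; the dropout
        # probability is passed only to nn.Dropout.
        self.norm1 = nn.RMSNorm()
        self.dropout1 = nn.Dropout(self.dropout)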
