
Commit 103dba2

Add MLP with batchNorm layer (#123)
* Add mlp with batchnorm layer
  Signed-off-by: Aisuko <[email protected]>
* Fix the wrong length of vocab size
  Signed-off-by: Aisuko <[email protected]>

Signed-off-by: Aisuko <[email protected]>
1 parent ddbad75 commit 103dba2

File tree

5 files changed: +357 −2 lines changed


Diff for: .devcontainer/devcontainer.json

+2 −2

@@ -12,7 +12,7 @@
         "ms-python.vscode-pylance"
       ]
     }
-  }
+  },

   // Features to add to the dev container. More info: https://containers.dev/features.
   // "features": {},
@@ -21,7 +21,7 @@
   // "forwardPorts": [],

   // Use 'postCreateCommand' to run commands after the container is created.
-  // "postCreateCommand": "pip3 install --user -r requirements.txt",
+  "postCreateCommand": "make prepare"

   // Configure tool-specific properties.
   // "customizations": {},

Diff for: Makefile

+2
@@ -67,6 +67,8 @@ source:
 	@poetry config repositories.source https://pypi.org/project/kimchima


+.PHONY: prepare
+prepare: poetry install-dev

 ###################################################################################################
 # Commit and recommit changes to github
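Note on the new target: because `poetry install-dev` sits on the dependency line rather than in a recipe, make treats `poetry` and `install-dev` as prerequisite targets, so `make prepare` (which the devcontainer's `postCreateCommand` above now runs after the container is created) simply invokes those two targets in order. This assumes the Makefile already defines `poetry` and `install-dev` targets elsewhere.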

Diff for: src/models/mlp_batchnorm.py

+196
@@ -0,0 +1,196 @@
# coding=utf-8

# Copyright [2024] [SkywardAI]
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import random
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

# https://www.kaggle.com/code/aisuko/implement-neural-net-with-batch-norm-layer

g=torch.Generator().manual_seed(2147483647)

class Linear:
    """
    Linear layer
    https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
    """

    def __init__(self, fan_in, fan_out, bias=True):
        self.weight=torch.randn((fan_in, fan_out), generator=g)/fan_in**0.5 # unit gaussian
        self.bias=torch.zeros(fan_out) if bias else None # default bias initialized to zeros

    def __call__(self, x):
        self.out=x@self.weight
        if self.bias is not None:
            self.out+=self.bias
        return self.out

    def parameters(self):
        """
        return tensors that are parameters of this layer
        """
        return [self.weight]+([] if self.bias is None else [self.bias])


class BatchNorm1d:
    """
    batchnorm layer
    formula: https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps=eps # used in the division
        self.momentum=momentum # keeps track of the running stats
        self.training=True

        # parameters (trained with backprop)
        self.gamma=torch.ones(dim)
        self.beta=torch.zeros(dim)
        # buffers (trained with a running 'momentum update')
        self.running_mean=torch.zeros(dim)
        self.running_var=torch.ones(dim)

    def __call__(self, x):
        """
        Follows https://arxiv.org/pdf/1502.03167

        Algorithm 1
        1. mini-batch mean
        2. mini-batch variance
        3. normalize
        4. scale and shift
        """

        # calculate the forward pass
        if self.training:
            xmean=x.mean(0, keepdim=True) # batch mean
            xvar=x.var(0, keepdim=True, unbiased=True) # batch variance
        else:
            xmean=self.running_mean
            xvar=self.running_var

        xhat=(x-xmean)/torch.sqrt(xvar+self.eps) # normalize to unit variance
        self.out=self.gamma*xhat+self.beta # create the out attribute for visualizing the training process
        # update the buffers

        if self.training:
            with torch.no_grad():
                self.running_mean=(1-self.momentum)*self.running_mean+self.momentum*xmean
                self.running_var=(1-self.momentum)*self.running_var+self.momentum*xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

class Tanh:
    """
    """
    def __call__(self, x):
        self.out=torch.tanh(x)
        return self.out

    def parameters(self):
        """
        no parameters in this layer
        """
        return []


class MlpBatchNormTrainer:
    """
    MlpBatchNormTrainer
    """
    ds_url="https://www.kaggleusercontent.com/kf/187064505/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..ZLA9V0sWqB_Px0312U15fQ.OO2wvSdp-fhBB0BTDAaLToek6CLGlzS4otHIsyHBd1feEJxIUq055-GIQb24Ez51pGq31hyzaN_vFeDRnqxFwyc12sDNqZ7uDhel-5xeXU08h0qNtOpoqXA-iJpPuV4u-dThq8Lk-zoOg_ZDmVNAW8XHZVAM2ZAHl9StyqN1n7eOGU0379mp_2ol2gyjXP01xNDH2n4kUSIIetktnagIon8Jm_tcLBB-DaWPTFwQ5L7NBP1t-omCUrKydTxAyPIFFwnid3T1vzEgSmYiUY8Ec-iC8OG5d2pKcod9FIAOkJH4Xu74Pvzp5UuOFzQXRByezEOkyD0ltAhfMOab0ebIi6YSTVKrna70HZhuxjWQRK9fIgvt0V7RMz84ZQspJWrgofowQrf7E1avVvXe7GQW4E7dYITqQoJvZ7dhlpujq1db6pkegRqOfuQzPJcD6UHBTpVRyi36rIQoLpFd63XLzY5eya4ScAy5H-frQhF0IU927Z86S9iR2AypqO3TXriPsMHjJ7o-DwXpnHCNkVfMJXeVxT36DRBiV9uCTL-e8_xOUKw50N5iG3NqTnos0IwSXvwrSBtHxUI71zo-I2Z-l5x_GqjEa9QVl1XX_q7GU_YFejlC-rT9KdcA_6TEVO6qaMpfvVvCc9kFYI7s7GQNbg.tIuWJu1a71qSZKZeG-TgPg/names.txt"
    n_embed=10 # the dimensionality of the character embedding vectors
    n_hidden=100 # the number of neurons in the hidden layer of the MLP
    n_block_size=3 # context length: how many characters do we take to predict the next one?
    max_steps=200000
    batch_size=32


    def __init__(self):
        raise Exception("This class is not meant to be instantiated")

    @classmethod
    def set_hyperparameters(cls, **kwargs):
        """
        Set hyperparameters
        """
        cls.n_embed=kwargs.get("n_embed", 10)
        cls.n_hidden=kwargs.get("n_hidden", 100)
        cls.n_block_size=kwargs.get("n_block_size", 3)
        cls.max_steps=kwargs.get("max_steps", 200000)
        cls.batch_size=kwargs.get("batch_size", 32)

    @classmethod
    def load_dataset(cls, filePath:str)->str:
        """
        Load the dataset
        """
        with open(filePath, "r", encoding="utf-8") as f:
            text=f.read()
        return text

    @classmethod
    def unique_chars(cls, text:str)->list:
        """
        Get all the unique characters in the text
        """
        return sorted(list(set(''.join(text))))

    @classmethod
    def stoi(cls, chars:list)->dict:
        """
        Convert characters to indices
        """
        stoi={char:i+1 for i,char in enumerate(chars)}
        stoi['.']=0
        return stoi

    @classmethod
    def itos(cls, chars:list)->dict:
        """
        Convert indices to characters
        """
        itos={i:char for char,i in cls.stoi(chars).items()}
        return itos

    @classmethod
    def build_vocab(cls, chars:list)->int:
        """
        Build a vocabulary from the unique characters
        """
        return len(chars)

    @classmethod
    def build_dataset(cls, words:str, stoi: dict)->tuple[torch.Tensor, torch.Tensor]:
        """
        Build the dataset
        """
        X,Y=[],[]

        for w in words:
            context=[0]*cls.n_block_size
            for ch in w+'.':
                ix=stoi[ch]
                X.append(context)
                Y.append(ix)
                context=context[1:]+[ix] # crop and append

        X=torch.tensor(X) # convert to tensor
        Y=torch.tensor(Y)
        return X,Y
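The `# unit gaussian` comment on `Linear.__init__` refers to keeping pre-activations at roughly unit scale: summing `fan_in` products grows the variance by a factor of `fan_in`, and dividing the weights by `fan_in**0.5` cancels that growth. A minimal standalone sketch of the effect (not part of the module; the shapes here are arbitrary):

import torch

g = torch.Generator().manual_seed(2147483647)
x = torch.randn(32, 30, generator=g)             # a batch of unit-variance inputs
w = torch.randn(30, 100, generator=g) / 30**0.5  # the same scaling Linear.__init__ applies
print(x.std().item(), (x @ w).std().item())      # both come out close to 1.0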
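The four steps listed in the `BatchNorm1d.__call__` docstring are Algorithm 1 of the batch-norm paper (https://arxiv.org/pdf/1502.03167). In that notation, for a mini-batch of m examples:

\mu_B = \frac{1}{m}\sum_{i=1}^{m} x_i, \qquad
\sigma_B^2 = \frac{1}{m}\sum_{i=1}^{m} (x_i - \mu_B)^2, \qquad
\hat{x}_i = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}}, \qquad
y_i = \gamma\,\hat{x}_i + \beta

The implementation applies this per feature, with dimension 0 as the batch axis; the only difference is that `x.var(..., unbiased=True)` divides by m − 1 rather than m. During training, `running_mean` and `running_var` are updated as exponential moving averages weighted by `momentum`, and at inference (`training=False`) those running estimates replace the batch statistics.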

Diff for: src/models/simple_gpt.py

+1
@@ -19,6 +19,7 @@
 from torch.nn import functional as F
 from torch.utils.tensorboard import SummaryWriter

+# https://www.kaggle.com/code/aisuko/gpt-from-scratch-as-a-script

 class SimpleGPT(nn.Module):
     def __init__(self, vocab_size):

Diff for: src/tests/test_mlp_batchnorm.py

+156
@@ -0,0 +1,156 @@
# coding=utf-8

# Copyright [2024] [SkywardAI]
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import random
import unittest

from pathlib import Path
import torch
from torch.nn import functional as F

from models.mlp_batchnorm import MlpBatchNormTrainer,Linear, BatchNorm1d, Tanh
from pkg.dataset_helper import DatasetHelper


class TestMLPBatchNorm(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        cls.n_embd=MlpBatchNormTrainer.n_embed
        cls.n_hidden=MlpBatchNormTrainer.n_hidden
        cls.n_block_size=MlpBatchNormTrainer.n_block_size

        src_dir = Path(os.path.dirname(os.path.abspath(__file__))).parent
        abs_file_path = os.path.join(src_dir, "input.txt")
        _ = DatasetHelper.download_remote_file(MlpBatchNormTrainer.ds_url, abs_file_path)
        cls.data=MlpBatchNormTrainer.load_dataset(abs_file_path)
        cls.unique_chars=MlpBatchNormTrainer.unique_chars(cls.data.splitlines())
        cls.stoi=MlpBatchNormTrainer.stoi(cls.unique_chars)
        cls.itos=MlpBatchNormTrainer.itos(cls.unique_chars)
        cls.vocab_size=MlpBatchNormTrainer.build_vocab(cls.itos)

    def test_mlp_batchnorm_trainer(self):

        self.assertEqual(self.vocab_size,27)
        random.seed(42)
        words=self.data.splitlines()
        random.shuffle(words)

        n1=int(0.8*len(words))
        n2=int(0.9*len(words))

        Xtr, Ytr=MlpBatchNormTrainer.build_dataset(words[:n1], self.stoi) # 80%
        Xdev, Ydev=MlpBatchNormTrainer.build_dataset(words[n1:n2],self.stoi) # 10%
        Xte, Yte=MlpBatchNormTrainer.build_dataset(words[n2:],self.stoi) # 10%
        g=torch.Generator().manual_seed(2147483647)
        self.assertEqual(self.n_embd, 10)
        C=torch.randn((self.vocab_size, self.n_embd), generator=g)

        self.assertEqual(C.shape, torch.Size([27, 10]))

        # sequential 6 MLP layers
        layers=[
            Linear(self.n_embd*self.n_block_size, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
            Linear(self.n_hidden, self.vocab_size, bias=False), BatchNorm1d(self.vocab_size)
        ]

        with torch.no_grad():
            # here, our last layer is a batch norm layer, so we don't scale the weights to make the softmax less confident;
            # instead we scale gamma (from Algorithm 1 of the batch norm paper),
            # because gamma in the batchnorm is the variable that multiplicatively interacts with the output of the normalization
            layers[-1].gamma*=0.1

            # all other layers: apply the gain
            for layer in layers[:-1]:
                if isinstance(layer, Linear):
                    layer.weight*=5/3 # boost the linear layer by the gain; the number comes from the torch docs
        # [C] the embedding matrix and all the parameters of all the layers
        parameters=[C]+[p for layer in layers for p in layer.parameters()]
        print(sum(p.nelement() for p in parameters)) # number of parameters in total
        for p in parameters:
            p.requires_grad=True


        # training loop
        lossi=[]
        ud=[]

        for i in range(MlpBatchNormTrainer.max_steps):
            # minibatch construct
            ix=torch.randint(0, Xtr.shape[0], (MlpBatchNormTrainer.batch_size,), generator=g)
            Xb, Yb=Xtr[ix], Ytr[ix] # batch X,Y

            # forward pass
            emb= C[Xb] # embed the characters into vectors
            x=emb.view(emb.shape[0], -1) # flatten/concatenate the vectors
            for layer in layers:
                x=layer(x)
            loss=F.cross_entropy(x, Yb) # loss function

            # backward pass
            for layer in layers:
                layer.out.retain_grad()

            for p in parameters:
                p.grad=None

            loss.backward()

            # update
            lr=0.1 if i<100000 else 0.01 # step learning rate decay
            for p in parameters:
                p.data+=-lr*p.grad

            # track stats
            if i%10000==0: # print every once in a while
                print(f'{i:7d}/{MlpBatchNormTrainer.max_steps:7d}: {loss.item():.4f}')
            lossi.append(loss.log10().item())

            with torch.no_grad():
                ud.append([(lr*p.grad.std()/p.data.std()).log10().item() for p in parameters])

            if i>=1000:
                break

        for layer in layers:
            layer.training=False

        g=torch.Generator().manual_seed(2147483647+10)

        for _ in range(20):
            out=[]
            context=[0]*self.n_block_size
            while True:
                # forward pass the neural net
                emb=C[torch.tensor([context])] # (1, block_size, n_embd)
                x=emb.view(emb.shape[0], -1) # concatenate the vectors
                for layer in layers:
                    x=layer(x)
                logits=x
                probs=F.softmax(logits, dim=1)
                self.assertEqual(probs.shape, torch.Size([1, 27]))
                # sample from the distribution
                ix=torch.multinomial(probs, num_samples=1, generator=g).item()
                # shift the context window and track the samples
                context=context[1:]+[ix]
                out.append(ix)
                if ix==0:
                    break
            self.assertIsNotNone(''.join(self.itos[i] for i in out[:-1]))
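A quick way to see what the `layers[-1].gamma *= 0.1` trick buys: with `vocab_size = 27`, a maximally unconfident softmax assigns each next character probability 1/27, so the first reported cross-entropy should sit near

-\ln\frac{1}{27} \approx 3.30

Shrinking the last BatchNorm's \gamma scales the output logits toward zero, which pushes the initial loss toward that value rather than the much larger loss produced by confidently wrong logits. The 5/3 factor applied to the hidden `Linear` weights is the gain torch's init documentation recommends for tanh nonlinearities, compensating for the squashing that tanh applies between layers.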
