from ExpandableModel import ExpandableModel

import torch
import torch.nn as nn
from torch.autograd import Variable

class MLP(ExpandableModel):

    def __init__(self, hidden_size, input_size, output_size, device):

        super().__init__(hidden_size, input_size, output_size, device)

        self.build()

        # initialize the layer weights using the Xavier (Glorot, 2010) method
        self.apply(self.init_weights_xavier)

    def forward(self, x):

        # pass the data through all layers of the network
        for module in self.modulelist:
            x = module(x)

        self.y = x

        return self.y

    def build(self):

        self.modulelist = nn.ModuleList()

        self.modulelist.append(nn.Linear(self.input_size, self.hidden_size))
        self.modulelist.append(nn.ReLU())
        self.modulelist.append(nn.Linear(self.hidden_size, self.hidden_size))
        self.modulelist.append(nn.ReLU())
        self.modulelist.append(nn.Linear(self.hidden_size, self.output_size))
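
        # resulting architecture:
        #   Linear(input_size, hidden_size) -> ReLU ->
        #   Linear(hidden_size, hidden_size) -> ReLU ->
        #   Linear(hidden_size, output_size)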

    def test(self, test_loaders, threshold, args):

        test_accuracies = [0]

        # generate a dictionary mapping tasks to models of the sizes that the network was when those tasks
        # were trained, containing subsets of the weights currently in the model (to mask new, post-expansion
        # weights when testing on tasks for which those weights did not exist during training)
        models = self.generate_model_dictionary()

        # Test the model on ALL tasks, including the one on which the model was most recently trained.
        for task_number, test_loader in enumerate(test_loaders):

            # from the dictionary formatted as {task number: model to use when testing that task number},
            # generated by generate_model_dictionary() above, fetch the model to be used when testing this
            # task (so as to mask weights which should not be taken into consideration)
            model = models.get(task_number + 1)

            model.restore_output_weights(task_number + 1)  # todo add to CNN

            # Set the module in "evaluation mode".
            # This is necessary because some network layers behave differently when training vs. testing.
            # Dropout, for example, zeroes/masks certain activations during TRAINING (model.train()) to
            # prevent overfitting. However, during TESTING/EVALUATION we do not want this to happen.
            model.eval()

            # total testing loss, summed over all test batches in the given task_number's entire test set
            test_loss = 0

            # total number of correct predictions over the given task_number's entire test set
            correct = 0

            # Wrap in torch.no_grad() because the weights have requires_grad=True (meaning PyTorch autograd
            # tracks the history of computation on those weights), but we don't need that tracking while
            # testing - we are no longer training, so gradients should no longer be computed (only "used").
            with torch.no_grad():

                # Each step of the iterator test_loader will return the following values:
                #
                # data: a 4D tensor of dimensions (test batch size, 1, 28, 28), representing the MNIST data
                # for each of the 28 x 28 = 784 pixels of each of the images in a given test batch
                #
                # target: a 1D tensor of dimension <test batch size> containing the ground truth labels for
                # each of the images in the corresponding test batch, in order
                for data, target in test_loader:

                    # Reshape the data so that it matches the dimensions the network expects - this flattens
                    # the (1, 28, 28) image dimensions of each sample into a single dimension of 784.
                    #
                    # For an explanation of view(), see:
                    # https://stackoverflow.com/a/42482819/9454504
                    #
                    # This code was used here in another experiment:
                    # https://github.com/kuc2477/pytorch-ewc/blob/4a75734ef091e91a83ce82cab8b272be61af3ab6/utils.py#L75
                    data = data.view(args.test_batch_size, -1)
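
                    # e.g. a (args.test_batch_size, 1, 28, 28) batch becomes (args.test_batch_size, 784)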

                    # wrap data and target in Variables - again, from the following experiment:
                    # https://github.com/kuc2477/pytorch-ewc/blob/4a75734ef091e91a83ce82cab8b272be61af3ab6/utils.py#L76
                    # (Variable is a legacy wrapper - since PyTorch 0.4 it simply returns a Tensor)
                    #
                    # .to(device):
                    # move data and target to the device (CPU or GPU) stored in the device variable
                    # (defined in main())
                    data, target = Variable(data).to(model.device), Variable(target).to(model.device)

                    # Forward pass: compute the predicted output by passing data to the model. Module objects
                    # override the __call__ operator, so you can call them like functions. When doing so, you
                    # pass a Tensor of input data to the Module and it produces a Tensor of output data. We
                    # have overridden forward() above, so our forward() method will be called here.
                    output = model(data)

                    # Define the testing loss to be cross entropy loss based on the predicted values (output)
                    # and ground truth labels (target), calculate the testing batch loss, and add it to the
                    # total testing loss over all batches in the given task_number's entire test set
                    # (accumulated in test_loss).
                    #
                    # NOTE: reduction='sum':
                    # By default (reduction='mean'), the losses are averaged over the observations in each
                    # minibatch. With reduction='sum', the losses are summed for each minibatch instead.
                    # (reduction='sum' replaces the deprecated size_average=False argument.)
                    #
                    # We use reduction='sum' because we want to SUM all testing batch losses and average them
                    # once, at the end of testing on the current task, by dividing by the total number of
                    # testing SAMPLES (not batches). Otherwise we would be computing an average loss for each
                    # testing batch and then averaging those averages, which could theoretically introduce
                    # some comparative loss of accuracy in the final testing loss value for this task.
                    #
                    # NOTE:
                    # <some loss function>.item() gets the scalar value held in the loss
                    criterion = nn.CrossEntropyLoss(reduction='sum')

                    test_loss += criterion(output, target).item()

                    # Get the index of the max log-probability for each of the samples in the testing batch.
                    #
                    # output is a 2D tensor of dimensions (test batch size, 10) containing network-predicted
                    # probabilities that the testing input is an image of each class (digits 0-9, signified by
                    # the index of each probability in the output tensor for a given test image). That is to
                    # say that in the second dimension of output, the classification probabilities might look
                    # like the following for a given image:
                    #   [0.1, 0.1, 0.05, 0.05, 0.2, 0.4, 0.1, 0.0, 0.0, 0.0]
                    # Because the sixth entry (index 5) contains the maximum value relative to all other
                    # indices, the network's prediction is that this image belongs to the sixth class - and is
                    # therefore the digit 5.
                    #
                    # NOTE: torch.max() returns the maximum value of EACH ROW of the input tensor in the given
                    # dimension dim. The second return value is the index location of each maximum value found
                    # (argmax). This is why we use the second return value as the value of the variable pred -
                    # we want the index of the maximum probability (not its value) - hence the [1] indexing at
                    # the end of the statement.
                    #
                    # ARGUMENTS:
                    #
                    # Using dimension 1 as the first argument allows us to get the index of the highest-valued
                    # column in each row of output, which practically translates to getting the maximum
                    # predicted class probability for each sample.
                    #
                    # If keepdim is True, the output tensors are of the same size as input except in the
                    # dimension dim (the first argument - in this case 1), where they are of size 1 (because
                    # we calculated ONE maximum value per row). Otherwise, dim is squeezed (see
                    # torch.squeeze()), resulting in the output tensors having 1 fewer dimension than input.
                    pred = output.max(1, keepdim=True)[1]
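
                    # For example:
                    #   torch.tensor([[0.1, 0.9], [0.8, 0.2]]).max(1, keepdim=True)[1]
                    # yields tensor([[1], [0]]) - the argmax of each row, with the row dimension kept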

                    # Check if predictions are correct, and if so add one to the total number of correct
                    # predictions across the entire testing set for each correct prediction.
                    #
                    # A prediction is correct if the index of the highest value in the prediction output is
                    # the same as the class index given by the label for that sample.
                    #
                    # For example (MNIST):
                    #   prediction: [0.1, 0.1, 0.05, 0.05, 0.2, 0.4, 0.1, 0.0, 0.0, 0.0]
                    #   label: 5
                    #
                    # This would be a correct prediction - the sixth entry (index 5) of the prediction holds
                    # the highest value, matching the label.
                    #
                    # The targets/labels are stored as scalar index values (e.g. torch.Tensor([1, 4, 5]) for
                    # the labels of a one, a four, and a five, in that order).
                    #
                    # tensor_X.view_as(other) returns a version of tensor_X reshaped to other.size().
                    #
                    # torch.eq() -> element-wise equality:
                    # tensor_X.eq(tensor_Y) returns a tensor of the same size as tensor_X with False (0) at
                    # every index for which the entries of tensor_X and tensor_Y do not match, and True (1)
                    # at every index for which they do.
                    #
                    # .sum() sums every entry of the tensor into a tensor holding a single value
                    #
                    # .item() gets the scalar value held in the sum tensor
                    correct += pred.eq(target.view_as(pred)).sum().item()
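
                    # For example:
                    #   torch.tensor([[1], [0]]).eq(torch.tensor([[1], [1]])) -> tensor([[True], [False]])
                    #   and .sum().item() on that result -> 1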

            # Divide the accumulated test loss across all testing batches for the current task_number by the
            # total number of testing samples in the task_number's test set (in this case, 10,000) to get the
            # average loss over the entire test set for task_number.
            test_loss /= len(test_loader) * args.test_batch_size

            # The overall accuracy of the model's predictions on the task indicated by task_number, as a
            # percentage: the count of its correct predictions divided by the number of predictions it made,
            # multiplied by 100.
            accuracy = 100. * correct / (len(test_loader) * args.test_batch_size)

            test_accuracies.append(accuracy)

            # For task_number's complete test set (all batches), display the average loss and accuracy.
            print('\nTest set {}: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
                task_number + 1, test_loss, correct, len(test_loader) * args.test_batch_size,
                accuracy))

        if test_accuracies[-1] < threshold:
            return -1  # accuracy minimum threshold not met on the most recent task
        else:
            return test_accuracies  # accuracy minimum threshold met

    def restore_output_weights(self, task_number):

        old_weights = self.task_post_training_weights.get(task_number)

        for name, parameter in self.named_parameters():

            # final layer weights
            if name == 'modulelist.{}.weight'.format(len(self.modulelist) - 1):
                parameter.data[...] = \
                    old_weights[-2][tuple(slice(0, n) for n in parameter.size())]

            # final layer biases
            elif name == 'modulelist.{}.bias'.format(len(self.modulelist) - 1):
                parameter.data[...] = \
                    old_weights[-1][tuple(slice(0, n) for n in parameter.size())]

    def reinitialize_output_weights(self):

        for name, parameter in self.named_parameters():

            # final layer weights
            if name == 'modulelist.{}.weight'.format(len(self.modulelist) - 1):
                # xavier_uniform_ is the in-place variant (xavier_uniform without the trailing
                # underscore is deprecated)
                torch.nn.init.xavier_uniform_(parameter.data)

            # final layer biases
            elif name == 'modulelist.{}.bias'.format(len(self.modulelist) - 1):
                parameter.data.fill_(0.1)
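

# A hypothetical usage sketch (illustrative only): it assumes ExpandableModel stores the constructor
# arguments as the attributes used by build() above (input_size, hidden_size, output_size) and provides
# init_weights_xavier(); the layer sizes below are arbitrary.
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # a two-hidden-layer MLP for flattened 28 x 28 MNIST images
    model = MLP(hidden_size=400, input_size=784, output_size=10, device=device).to(device)

    # one random batch standing in for a flattened MNIST test batch
    fake_batch = torch.randn(32, 784).to(device)

    print(model(fake_batch).size())  # expected: torch.Size([32, 10])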