-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_dataloader.py
executable file
·313 lines (201 loc) · 14.6 KB
/
train_dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
'''
Team 17 - CS 7641 ML Project Fall 2021
"Visual Reasoning for the Visually Impaired"
Authors: Angana Borah, Devshree Bharatia, Dimitri Adhikary, Megha Thukral, Yusuf Ali
Script Summary: Perform training and evaluation on the filtered VizWiz dataset
'''
import copy
import glob
import os
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as data
import torchvision
import torch.nn.functional as F
import matplotlib.pyplot as plt
import sys
import pickle as pk
import tensorflow as tf
import pickle, ast
import pandas as pd
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import models
import dataset
def calculate_loss(MLP_output, local_labels, LEN_TRAIN_SET = 0, LEN_VAL_SET = 0):
'''
Function which returns the average loss value for all the samples in a training/validation batch based on the paper: https://arxiv.org/pdf/1708.00584.pdf
Implementation taken from the PyTorch VizWiz GitHub website: https://github.com/DenisDsh/VizWiz-VQA-PyTorch/
Args:
MLP_output: Tensor of size (batch_size, 3000) - batch output of final layer of the MLP netowrk
local_labels: Tensor of size (batch_size,3000) - batch of label vectors for corresponding MLP output
LEN_TRAIN_SET: int - size of training set (default value = 0 ---> so if calculate_loss() is called from evaluate(), the value of LEN_TRAIN_SET = 0 by default)
LEN_VAL_SET: int - size of validation set (default value = 0 ---> so if calculate_loss() is called from train(), the value of LEN_VAL_SET = 0 by default)
Returns:
loss: float - average loss value for the batch
'''
log_softmax = nn.LogSoftmax(dim = 1).cuda()
nll = -1 * log_softmax(MLP_output) # calculate log softmax of the MLP_output
loss = (nll * local_labels / 10).sum(dim = 1).mean() # calculate soft cross-entropy loss as defined in https://arxiv.org/pdf/1708.00584.pdf
if LEN_TRAIN_SET != 0: # if condition to check whether the loss function was called from train() or evaluate() and apply normalistion based on that
loss = loss / LEN_TRAIN_SET # Normalise the training loss
elif LEN_VAL_SET != 0:
loss = loss / LEN_VAL_SET # Normalise the validation loss
else:
print("\n Cannot calculate loss !")
return loss
def train(epoch, train_loader, model, optimizer, tb, GPU_DEVICE, LEN_TRAIN_SET):
'''
Perform training on the training dataset and save loss values for tensorboard visualisation if required
Args:
epoch: index of the current epoch (int)
train_loader: training set dataloader (PyTorch dataloader)
model: MLP network (derived class torch.nn.Module)
optimizer: Adam optimiser (torch.optim.Adam instance)
tb: TensorBoard summary writer object
GPU_DEVICE: name of GPU device
LEN_TRAIN_SET: size of training set
Return:
None
'''
with tqdm(train_loader, unit="batch") as tepoch_train: # tqdm wrapper for visualisation in command terminal
model.train() # set the model into "training" mode
train_loss = 0
for local_batch, local_labels in tepoch_train:
if torch.cuda.is_available(): # Shift the batched data and labels onto GPU for faster computation, if GPU is available
local_batch, local_labels = local_batch.to(GPU_DEVICE), local_labels.to(GPU_DEVICE)
local_batch = local_batch.float() # Convert the batched data and labels to float type as it is required for gradient computation
local_labels = local_labels.float()
local_batch = local_batch.requires_grad_(True) # Ensure batched data and labels have associated gradient tensors which will be updated in the backward propation step
local_labels = local_labels.requires_grad_(True)
optimizer.zero_grad() # zero out the gradients (for more details refer to : https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
MLP_output = model(local_batch) # forward pass of the SimpleNet() MLP network
loss = calculate_loss(MLP_output, local_labels, LEN_TRAIN_SET) # calculate train loss
loss.requires_grad_(True) # Ensure loss variable has an associated gradient tensor so that it can be used in the backprop step and is appended to the dynamic computation graph used by PyTorch for auto-differentiation
loss.backward() # Perform backprop
optimizer.step() # Update the parameters based on Adam optimiser update rule
# acc = vqa_accuracy(out.data, a.data).cpu()
train_loss += loss.item()
print("\n Epoch ", epoch + 1, " Train Loss : ", train_loss)
if tb is not None:
tb.add_scalar('Train_Loss', train_loss, epoch) # Save the train loss value for current epoch for TensorBoard visualisation
def evaluate(epoch, val_loader, model, tb, GPU_DEVICE, LEN_VAL_SET):
'''
Perform evaluation of teh trained model on the validation dataset and save loss values for tensorboard visualisation if required
Args:
epoch: index of the current epoch (int)
val_loader: validation set dataloader (PyTorch dataloader)
model: MLP network (derived class torch.nn.Module)
tb: TensorBoard summary writer object
GPU_DEVICE: name of GPU device
LEN_VAL_SET: size of validation set
Return:
None
'''
with tqdm(val_loader, unit="batch") as tepoch_val: # tqdm wrapper for visualisation in command terminal
model.eval() # set the model into "evaluation" mode
val_loss = 0
with torch.no_grad():
for local_batch, local_labels in tepoch_val:
if torch.cuda.is_available(): # Shift the batched data and labels onto GPU for faster computation, if GPU is available
local_batch, local_labels = local_batch.to(GPU_DEVICE), local_labels.to(GPU_DEVICE)
local_batch = local_batch.float() # Convert the batched data and labels to float type as it is required for gradient computation
local_labels = local_labels.float()
MLP_output = model(local_batch) # forward pass of the SimpleNet() MLP network
loss = calculate_loss(MLP_output, local_labels, LEN_VAL_SET) # calculate validation loss
val_loss += loss.item()
print("\n Epoch ", epoch + 1, " Val Loss : ", val_loss)
if tb is not None:
tb.add_scalar('Val_Loss', val_loss, epoch) # Save the validation loss value for current epoch for TensorBoard visualisation
def vqa_accuracy(predicted, true):
'''
Copied the function from the VizWiz PyTorch GitHub website: https://github.com/DenisDsh/VizWiz-VQA-PyTorch/
Not sure if this is correct - so not using this function right now
'''
print("\n Preicted Shape: ", predicted.shape)
print("\n True Shape: ", true.shape)
# _, predicted_index = predicted.max(dim=1, keepdim=True)
# agreeing = true.gather(dim=1, index=predicted_index)
_, predicted_index = torch.max(predicted)
agreeing = true.gather(dim=1, index = predicted_index)
return (agreeing * 0.33333).clamp(max=1)
def main(NUM_EPOCHS, BATCH_SIZE, LEARNING_RATE, TRAIN_SAMPLES, VAL_SAMPLES, DATA_PATH, SAVE_TENSORBOARD_VAL):
'''
Main function which detects GPU, loads VizWiz PyTorch dataset and dataloaders, performs training and evaluation and saves loss values for tensorboard visuaisation (if required)
Args:
NUM_EPOCHS: number of epochs to train for (int)
BATCH_SIZE: number of data samples in one batch of training (int)
LEARNING_RATE: learnign rate for Adam optimsier (float)
TRAIN_SAMPLES: size of training set (int)
VAL_SAMPLES: size of validation set (int)
DATA_PATH: absolute path of pkl_files directory (str)
SAVE_TENSORBOARD_VAL: flag variable which decides if tensorbaord visualisation is required or not (str: 'yes'/ 'no')
Returns:
None
'''
print("\n Is GPU Available: ", torch.cuda.is_available()) # Check if GPU is available
gpu_ids = []
if torch.cuda.is_available():
gpu_ids += [gpu_id for gpu_id in range(torch.cuda.device_count())]
device = torch.device(f'cuda:{gpu_ids[0]}')
print("\n Device Name: ", device)
GPU_DEVICE = device # Assign identified GPU device to "GPU_DEVICE" variable which will be used to transfer computation to GPU
if SAVE_TENSORBOARD_VAL == 'yes':
SAVE_TENSORBOARD_VAL = True # "SAVE_TENSORBOARD_VAL" is flag variable which determiens if train/val losses will be saved for visualisation or not
else:
SAVE_TENSORBOARD_VAL = False
# -------------------------------------------------------------------------------------------------------------------
# Setup PyTorch Dataset and Dataloader from the filtered VizWiz Dataset
print("\n Loading the Dataset ...")
VizWiz_dataset = dataset.VizWiz_VQA_Dataset(DATA_PATH) # Load the PyTorch dataset (refer to dataset.py for more details) by loading the pkl files from the directory entered in the terminal
print("\n Dataset Loaded !")
print("\n Creating Training and Validation splits ...")
train_set, val_set = random_split(VizWiz_dataset, [TRAIN_SAMPLES, VAL_SAMPLES]) # Randomly split the loaded dataset into training set and validation | No.of samples in each set is specified in command terminal while running the script
print("\n Training and Validation splits created !")
print("\n Creating PyTorch dataloaders for training and validation sets ...")
train_loader = DataLoader(train_set, batch_size = BATCH_SIZE, shuffle = True, num_workers = 4) # Create training PyTorch dataloader (for setting up batched_data) using the training set loaded earlier
val_loader = DataLoader(val_set, batch_size = BATCH_SIZE, shuffle = True, num_workers = 4) # Create validation PyTorch dataloader (for setting up batched_data) using the validation set loaded earlier
print("\n PyTorch dataloaders for training and validation sets created !")
LEN_TRAIN_SET = len(train_set) # Number of samples in training set
LEN_VAL_SET = len(val_set) # Number of samples in validation set
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Instantiate the SimpleNet() MLP fully connected network, Adam optimiser
print("\n Setting up Deep Network...")
model = models.SimpleNet()
print("\n Deep Network instantiated !")
if torch.cuda.is_available():
model.to(GPU_DEVICE) # Transfer model to gpu for fast computation, if GPU is available
print("\n Model transferred to GPU !")
else:
print("\n Model attached to CPU as GPU not found !")
print("\n Setting up Adam optimiser...")
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
print("\n Optimiser instantiated !")
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Perform training and save epoch loss values for tensorboard visualisation (if required)
if SAVE_TENSORBOARD_VAL == True:
tb = SummaryWriter() # Create the tensorbaord "Summary Writer" object which stores train and val losses for visualisation
else:
tb = None
print("\n Starting traing and evaluation...")
for epoch in (range(NUM_EPOCHS)):
train(epoch, train_loader, model, optimizer, tb, GPU_DEVICE, LEN_TRAIN_SET) # Perform training
evaluate(epoch, val_loader, model, tb, GPU_DEVICE, LEN_VAL_SET) # Perform validation
if SAVE_TENSORBOARD_VAL == True:
tb.close()
torch.cuda.empty_cache() # Clear the GPU cache
print("\n Training and Validation done !")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--num_epochs', required = True, type = int, help = "Enter the number of epochs to train for")
parser.add_argument('--batch_size', required = True, type = int, help = "Enter the batch_size to be used for training")
parser.add_argument('--learning_rate', required = True, type = float, help = "Enter the learning_rate to be used for training")
parser.add_argument('--train_samples', required = True, type = int, help = "Enter number of samples used in train set out of 13108 (train_samples + val_samples = 13108)")
parser.add_argument('--val_samples', required = True, type = int, help = "Enter number of samples used in val set out of 13108 (train_samples + val_samples = 13108)")
parser.add_argument('--path_pkl_files', type = str, help = "Enter the absolute directory location of the pkl_files folder")
parser.add_argument('--tensor_board_viz', type = str, help = "Is tensorboard visualisation required for train and val losses: Yes/No ")
args = parser.parse_args()
main(args.num_epochs, args.batch_size, args.learning_rate, args.train_samples, args.val_samples, args.path_pkl_files, args.tensor_board_viz)