-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
114 lines (94 loc) · 3.56 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import argparse
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F
import pandas as pd
from commit import Commit
import random
import network
from pycm import ConfusionMatrix
CSV_FILENAME = "dataset.csv"  # default dataset read by read_commits_data_from_csv
CSV_FILE_DELIMITER = "#"  # column separator used in the dataset CSV
EPOCHS_COUNT = 40  # default number of training epochs
BATCH_SIZE = 1  # samples per training batch
TRAINING_DATASET_SIZE_RATIO = 0.85  # fraction of commits used for training; the rest validates
LEARNING_RATE = 0.00005  # Adam learning rate
def get_batch_features_tensor(index, batch_size=BATCH_SIZE):
    """Stack the feature lists of one training batch into a tensor.

    Takes ``batch_size`` consecutive commits from the module-level
    ``training_dataset`` starting at *index*.
    """
    feature_rows = []
    for position in range(index, index + batch_size):
        feature_rows.append(training_dataset[position].get_all_features_list())
    return torch.tensor(feature_rows)
def get_batch_labels_tensor(index, batch_size=BATCH_SIZE):
    """Stack the label lists of one training batch into a tensor.

    Takes ``batch_size`` consecutive commits from the module-level
    ``training_dataset`` starting at *index*.
    """
    label_rows = []
    for position in range(index, index + batch_size):
        label_rows.append(training_dataset[position].get_labels_list())
    return torch.tensor(label_rows)
def prepare_training_and_validation_datasets(training_set_size_ratio=TRAINING_DATASET_SIZE_RATIO, batch_size=BATCH_SIZE):
    """Shuffle the global ``commits`` list and split it into train/validation.

    The training split is truncated to a whole number of batches; everything
    left over becomes the validation set.  Results are published through the
    module-level globals consumed by the training and validation routines.
    """
    global training_dataset, validation_dataset
    global training_dataset_size, validation_dataset_size
    global batches_count

    random.shuffle(commits)
    # Round the raw split point down to a multiple of batch_size.
    raw_training_size = int(len(commits) * training_set_size_ratio)
    batches_count = raw_training_size // batch_size
    training_dataset_size = batches_count * batch_size
    training_dataset = commits[:training_dataset_size]
    validation_dataset = commits[training_dataset_size:]
    validation_dataset_size = len(validation_dataset)
def read_commits_data_from_csv(csv_filename=CSV_FILENAME, csv_file_delimiter=CSV_FILE_DELIMITER):
    """Load commit rows from *csv_filename* into the global ``commits`` list.

    Each CSV row supplies the commit id, project, comment and label by column
    name, plus every remaining column (position 4 onwards) as raw feature
    values, and builds one ``Commit`` per row.  Finishes by preparing the
    shared text vectorizer on the loaded commits.
    """
    global commits
    commits = []
    csv_contents = pd.read_csv(csv_filename, delimiter=csv_file_delimiter)
    # iterrows() yields each row as a Series, so the positional feature slice
    # data[4:] behaves exactly as before.  This drops the unused `cols` local
    # and the unconditional `.iloc[0]`, which raised IndexError on an empty
    # (header-only) CSV even though the value was never used.
    for _, data in csv_contents.iterrows():
        commit = Commit(data["commitId"], data["project"], data["comment"], data["label"], data[4:])
        commits.append(commit)
    Commit.prepare_text_vectorizer()
def initialize_network():
    """Pick the compute device and build the model and its optimizer.

    Publishes ``device``, ``model`` and ``optimizer`` as module-level
    globals for the training and validation routines.
    """
    global device, model, optimizer

    cuda_available = torch.cuda.is_available()
    device = torch.device("cuda:0" if cuda_available else "cpu")
    model = network.Network().to(device)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
def validate():
    """Evaluate the model on the validation split and print its accuracy.

    Also records the ground-truth and predicted labels in the module-level
    ``input_labels`` / ``predicted_labels`` lists so a confusion matrix can
    be printed after training.  Leaves the model in eval mode.
    """
    global input_labels
    global predicted_labels
    input_labels = []
    predicted_labels = []
    model.eval()
    # Renamed from `total_true_positives`: this counts correct predictions
    # across ALL classes (the accuracy numerator), not one class's TPs.
    correct_predictions = 0
    with torch.no_grad():
        for j in range(validation_dataset_size):
            sample = validation_dataset[j]
            network_output = model(sample.get_all_features_tensor().to(device))
            predicted_index = int(network_output.argmax(dim=0, keepdim=False))
            if predicted_index == sample.get_label():
                correct_predictions += 1
            input_labels.append(sample.get_label())
            predicted_labels.append(predicted_index)
    # Guard against an empty validation split (e.g. ratio == 1.0), which
    # previously raised ZeroDivisionError.
    accuracy = correct_predictions / validation_dataset_size if validation_dataset_size else 0.0
    print("Accuracy:", accuracy)
def print_confusion_matrix(input_labels, predicted_labels):
    """Build and pretty-print the confusion matrix for the given labels."""
    matrix = ConfusionMatrix(input_labels, predicted_labels)
    print(matrix)
def train_all(epochs_count=EPOCHS_COUNT, batch_size=BATCH_SIZE):
    """Run the full training loop, validating after every epoch.

    Prints the mean training loss per epoch followed by the validation
    accuracy.  Uses the module-level ``model``, ``optimizer``, ``device``
    and ``batches_count`` prepared by the setup functions.
    """
    for i in range(epochs_count):
        # validate() switches the model to eval mode at the end of each
        # epoch, so training mode must be restored every epoch — calling
        # model.train() once before the loop left epochs 2+ in eval mode.
        model.train()
        training_loss = 0
        for j in range(batches_count):
            # Batch j starts at sample j * batch_size; passing j directly
            # made consecutive batches overlap for any batch_size > 1
            # (masked by the default BATCH_SIZE of 1).
            batch_start = j * batch_size
            network_output = model(get_batch_features_tensor(batch_start, batch_size).to(device))
            optimizer.zero_grad()
            # Labels must live on the same device as the network output,
            # otherwise this crashes when running on CUDA.
            loss_value = network.loss(network_output, get_batch_labels_tensor(batch_start, batch_size).to(device))
            training_loss += loss_value.item()
            loss_value.backward()
            optimizer.step()
        training_loss /= batches_count
        print("Epoch number:", i + 1)
        print("Training loss:", training_loss)
        validate()
        print("---------------------")
if __name__ == "__main__":
    # Pipeline: build model/optimizer, load labelled commits from CSV,
    # shuffle and split them, train with per-epoch validation, then report
    # the confusion matrix from the final validation pass.
    initialize_network()
    read_commits_data_from_csv()
    prepare_training_and_validation_datasets()
    train_all()
    print_confusion_matrix(input_labels, predicted_labels)