-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipe_line.py
277 lines (206 loc) · 8.21 KB
/
pipe_line.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
"""This module will containg all the functions made for the processes
of modelling, training and evaluation. It defines the mathematical
functions needed for multi-class logistic regression, and provides a
few plots that make the model's results easier to interpret."""
# Define an extended sigmoid (softmax) function for multi-class classification.
# Taken from the ML script for TU Darmstadt and the Springer textbook on Numerical Analysis.
# Import the packages this module needs to run.
import numpy as np
import matplotlib.pyplot as plt
def sigmoid_softmax(input: np.ndarray) -> np.ndarray:
    """Convert raw class scores into a probability distribution (softmax).

    The softmax generalises the sigmoid from binary to multi-class
    classification: each score is exponentiated and divided by the sum of
    the exponentiated scores of its sample, so the results are
    non-negative and sum to 1 across the classes. The class with the
    largest probability is the predicted class.

    Args:
        input: Real-valued scores. Either a batch with one row per sample
            and one column per class, or a single 1-D score vector.
            Arrays with more than two dimensions are normalised along
            axis 1. (The name shadows the ``input`` builtin; it is kept
            so existing keyword callers continue to work.)

    Returns:
        np.ndarray: Probabilities with the same shape as ``input``.
    """
    scores = np.asarray(input)
    # A lone score vector has its classes on axis 0; batches keep them on axis 1.
    class_axis = 0 if scores.ndim == 1 else 1
    # Subtracting the per-sample maximum does not change the result
    # mathematically, but keeps np.exp from overflowing on large scores.
    shifted = scores - np.max(scores, axis=class_axis, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=class_axis, keepdims=True)
def cross_entropy(
    y_predicted: np.ndarray, y_true: np.ndarray, eps: float = 1e-8
) -> float:
    """Compute the mean cross-entropy loss for multi-class classification.

    The cross-entropy measures the deviation between the predicted class
    probabilities and the true labels. Minimising it during training
    pushes the model towards assigning each sample to its correct class.
    (Formula taken from the Springer textbook on Numerical Analysis.)

    Args:
        y_predicted: Predicted probabilities, one row per sample and one
            column per class.
        y_true: True labels with the same layout as ``y_predicted``
            (presumably one-hot encoded - confirm against the caller).
        eps: Small constant added to the probabilities before taking the
            logarithm so that a predicted probability of exactly 0 does
            not produce ``-inf``. Defaults to 1e-8.

    Returns:
        float: The loss averaged over all samples.
    """
    # Sum over the class axis gives the per-sample loss; the mean
    # aggregates over samples.
    per_sample = np.sum(y_true * np.log(y_predicted + eps), axis=1)
    return -np.mean(per_sample)
def predict(X: np.ndarray, weight: np.ndarray) -> np.ndarray:
    """Predict a class index for every sample in ``X``.

    The samples are projected onto the learned weights, the resulting
    scores are turned into probabilities with the softmax, and the index
    of the most probable class is selected for each row.

    Args:
        X (np.ndarray): Feature matrix of the dataset, one row per sample.
        weight (np.ndarray): Learned weights from gradient descent.

    Returns:
        np.ndarray: Column vector (shape ``(-1, 1)``) holding the
        predicted class index of each sample.
    """
    class_probabilities = sigmoid_softmax(np.dot(X, weight))
    # argmax yields the column index of the largest probability per row;
    # the reshape turns the flat result into a column vector.
    return np.argmax(class_probabilities, axis=1).reshape(-1, 1)
def accuracy_of_model(
    y_predicted: np.ndarray, y: np.ndarray
) -> tuple[np.ndarray, float]:
    """Compare predictions against the true labels and compute accuracy.

    Args:
        y_predicted (np.ndarray): Predicted class labels.
        y (np.ndarray): True class labels.

    Returns:
        tuple[np.ndarray, float]: The element-wise comparison array
        (``True`` where the prediction is correct) and its mean, i.e. the
        fraction of correct predictions.
    """
    predicted = np.asarray(y_predicted)
    actual = np.asarray(y)
    # A column vector (n, 1) compared with a flat vector (n,) would
    # silently broadcast to an (n, n) matrix and yield a meaningless
    # accuracy. When both hold the same number of labels, align the true
    # labels with the prediction layout before comparing.
    if predicted.shape != actual.shape and predicted.size == actual.size:
        actual = actual.reshape(predicted.shape)
    matches = predicted == actual
    return matches, np.mean(matches)
def gradient_descent(
    X: np.ndarray,
    y: np.ndarray,
    learning_rate: float = 0.1,
    iter: int = 1000,
    print_loss: bool = False,
    *,
    seed: int | None = None,
) -> tuple[np.ndarray, list]:
    """Learn softmax-regression weights with batch gradient descent.

    The weights are initialised from a standard normal distribution and
    then updated ``iter`` times using the gradient of the cross-entropy
    loss over the full training set.

    Args:
        X (np.ndarray): Training matrix, one row per sample and one
            column per feature.
        y (np.ndarray): One-hot encoded targets, one row per sample and
            one column per class.
        learning_rate (float, optional): Step size of each weight update.
            Defaults to 0.1.
        iter (int, optional): Number of update steps. Defaults to 1000.
            (The name shadows the ``iter`` builtin; it is kept so
            existing callers continue to work.)
        print_loss (bool, optional): When True, print the loss every 100
            iterations. Defaults to False.
        seed (int | None, optional): Seed for the weight initialisation.
            When None (the default) NumPy's global random state is used,
            exactly as before; pass an integer for reproducible runs.

    Returns:
        tuple[np.ndarray, list]: The learned weight matrix of shape
        ``(num_features, num_classes)`` and the list of loss values, one
        per iteration.
    """
    num_samples, num_features = X.shape
    num_classes = y.shape[1]

    if seed is None:
        weight = np.random.randn(num_features, num_classes)
    else:
        weight = np.random.default_rng(seed).standard_normal(
            (num_features, num_classes)
        )

    losses = []
    for step in range(1, iter + 1):
        probability_dist = sigmoid_softmax(np.dot(X, weight))
        # Gradient of the mean cross-entropy with respect to the weights.
        gradient = np.dot(X.T, probability_dist - y) / num_samples
        weight = weight - learning_rate * gradient
        # The recorded loss belongs to the weights used for this step's
        # forward pass, i.e. before the update above.
        loss = cross_entropy(probability_dist, y)
        losses.append(loss)
        if print_loss and step % 100 == 0:
            print(f"Iteration {step}, Loss: {loss}")
    return weight, losses
def confusion_matrix_plot(y_true: np.ndarray, y_predicted: np.ndarray) -> None:
    """Plot a confusion matrix of true versus predicted class labels.

    Cell ``(i, j)`` counts the samples whose true class is the i-th label
    and whose predicted class is the j-th label, where the labels are the
    sorted distinct values found in either input. The diagonal therefore
    holds the correct predictions and the off-diagonal cells the
    confusions. Courtesy of Nihal Barua, thanks for showing me what a
    confusion matrix is.

    Args:
        y_true (np.ndarray): True class labels.
        y_predicted (np.ndarray): Predicted class labels; must contain
            the same number of labels as ``y_true``.

    Raises:
        ValueError: If the two inputs hold a different number of labels.
    """
    # Flatten so that a column vector and a flat vector can be mixed.
    true_labels = np.asarray(y_true).ravel()
    predicted_labels = np.asarray(y_predicted).ravel()
    if true_labels.size != predicted_labels.size:
        raise ValueError(
            "y_true and y_predicted must contain the same number of labels"
        )

    # Map every label to its position among the sorted distinct labels,
    # so labels that are not exactly 0..k-1 are still counted correctly.
    all_classes, positions = np.unique(
        np.concatenate((true_labels, predicted_labels)), return_inverse=True
    )
    num_classes = len(all_classes)
    true_positions = positions[: true_labels.size]
    predicted_positions = positions[true_labels.size :]

    confusion_matrix = np.zeros((num_classes, num_classes), dtype=int)
    # Unbuffered add: repeated (true, predicted) pairs each increment the cell.
    np.add.at(confusion_matrix, (true_positions, predicted_positions), 1)

    plt.figure(figsize=(10, 10))
    plt.imshow(confusion_matrix, interpolation="nearest", cmap="Blues")
    plt.colorbar()
    # Label the axes with the real class values rather than their positions.
    plt.xticks(np.arange(num_classes), all_classes)
    plt.yticks(np.arange(num_classes), all_classes)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix on Validation Set")
    plt.show()
def loss_plot(loss: list | np.ndarray) -> None:
    """Plot the loss values in order to show how training progressed.

    Args:
        loss (list | np.ndarray): Loss values, one per epoch; lower
            values mean a better fit.
    """
    plt.plot(loss)
    # Axis labels and title are independent of each other.
    plt.title("Model Loss over Epochs")
    plt.ylabel("Loss")
    plt.xlabel("Epochs")
    plt.show()
def compare_accuracy(accuracy_test: float, acuracy_train: float) -> None:
    """Show test and train accuracy side by side in a bar chart.

    Args:
        accuracy_test (float): Accuracy on the test data; the y-axis is
            labelled in percent.
        acuracy_train (float): Accuracy on the training data, in the
            same unit.
    """
    # Insertion order of the dict fixes the left-to-right bar order.
    bars = {
        "Accuracy of Predicition on Test Data": accuracy_test,
        "Accuracy of Prediction on Train Data": acuracy_train,
    }
    plt.bar(list(bars.keys()), list(bars.values()))
    plt.ylabel("Accuracy in %")
    plt.title("Comparison of Accuracies")
    plt.show()
def model(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_valid: np.ndarray,
    y_valid_test: np.ndarray,
    y_valid_train: np.ndarray,
    iter: int = 1000,
    learning_rate: float = 0.1,
    print_loss: bool = False,
) -> None:
    """Train the classifier, evaluate it and visualise the results.

    Runs gradient descent on the training data, predicts classes for the
    training and validation features, prints both accuracies with their
    comparison arrays, and finally shows the accuracy bar chart, the loss
    curve and the confusion matrix of the validation predictions.

    Args:
        X_train (np.ndarray): Training features.
        y_train (np.ndarray): Training targets handed to gradient descent
            (presumably one-hot encoded - confirm against the caller).
        X_valid (np.ndarray): Validation features.
        y_valid_test (np.ndarray): True labels compared against the
            predictions on ``X_valid``.
        y_valid_train (np.ndarray): True labels compared against the
            predictions on ``X_train``.
        iter (int, optional): Number of gradient-descent iterations.
            Defaults to 1000.
        learning_rate (float, optional): Gradient-descent step size.
            Defaults to 0.1.
        print_loss (bool, optional): Forwarded to gradient descent to
            enable progress output. Defaults to False.
    """
    # Training: learn the weights and keep the loss history for plotting.
    weight, model_losses = gradient_descent(
        X_train,
        y_train,
        learning_rate=learning_rate,
        iter=iter,
        print_loss=print_loss,
    )

    # Prediction on both datasets with the learned weights.
    train_predictions = predict(X_train, weight)
    valid_predictions = predict(X_valid, weight)

    # Evaluation: comparison arrays plus mean accuracy scaled to percent.
    train_matches, train_mean = accuracy_of_model(train_predictions, y_valid_train)
    valid_matches, valid_mean = accuracy_of_model(valid_predictions, y_valid_test)
    valid_percent = valid_mean * 100
    train_percent = train_mean * 100

    print(f"The prediction accuracy of the training dataset is {train_percent}%")
    print("The comparision array is as follows:")
    print(train_matches)
    print(f"The prediction accuracy of the validation dataset is {valid_percent}%")
    print("The comparision array is as follows:")
    print(valid_matches)

    # Visualisation: accuracies, loss curve, confusion matrix.
    compare_accuracy(valid_percent, train_percent)
    loss_plot(model_losses)
    confusion_matrix_plot(y_valid_test, valid_predictions)