-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNN.py
365 lines (261 loc) · 13 KB
/
NN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
import scipy
import numpy as np
import logging
import root_numpy as rnp
from sklearn import neural_network as nn
import time
import matplotlib.pyplot as plt
import tqdm
import os
import shutil
from sklearn.preprocessing import StandardScaler
start_time =time.time()
logging.basicConfig(level = logging.info)
""" The logging library allows me to recieve outputs faster than the print function for debugging. At the moment I have removed most of the debug outputs, and left in the info outputs,
but should I want too, I can have everything export into a log file."""
def loader(file_type, data):
test_sig = "/phys/users/arbo94/Decision_Tree/Inputs/testingsignalII.root"
train_sig = "/phys/users/arbo94/Decision_Tree/Inputs/trainingsignalII.root"
test_bkg = "/phys/users/arbo94/Decision_Tree/Inputs/testingbkg.root"
train_bkg = "/phys/users/arbo94/Decision_Tree/Inputs/trainingbkg.root"
uncut = "/phys/users/arbo94/Decision_Tree/Inputs/Hv_Uncutrad_signal.root"
branchNames =["Event_Num","Jet_Num","Jet_Pt","Cal_Ratio","Track_Count"]
if (file_type == "train"):
if (data == "sig"):
array = rnp.root2array(train_sig, "Training_Variables", branchNames)
return array
elif(data == "bkg"):
array = rnp.root2array(train_bkg, "Training_Variables", branchNames)
return array
else:
logging.info("Not a valid data option.")
elif (file_type == "test"):
if (data == "sig"):
array = rnp.root2array(test_sig, "Training_Variables", branchNames)
return array
elif(data == "bkg"):
array = rnp.root2array(test_bkg, "Training_Variables", branchNames)
return array
elif(data =="uncut"):
array = rnp.root2array(uncut, "Training_Variables", branchNames)
return array
else:
logging.info("Not a valid data option.")
else:
logging.info("Not a valid Train/Test option")
""" The loader function here is to take a root file, and put it into a numpy structured array using the root_numpy libary [emphasis on the fact it is a structured array].
The basic set up of the function is:
array = rnp.root2array(File_location, Name_of_TTree, List_of_branch_names). All values should be strings, except of the list, which should be a list of strings.
The huge if/else is to make typing it out below easier."""
def cuts(array):
primer = []
for event in tqdm.tqdm(array):
if (event[1] >=25000):
primer.append(event)
else:
pass
return primer
""" This function is to add our Pt cut of 25 GeV]. This could be done with splicing, but I am not comfortable with splicing, hence this horrendous for loop."""
def reshape(array):
primer = np.column_stack((array["Cal_Ratio"],array["Jet_Pt"],array["Track_Count"]))
return primer
"""This function removes event number and jet number to not confuse the Tree, as the classifier cant hide variables from being ran,(unlike TMVA)"""
''' Loading arrays '''
logging.info("Loading Arrays from .root file")
logging.info("trainingsignal.root")
train_sig = loader("train","sig")
logging.info("trainingbkg.root")
train_bkg = loader("train","bkg")
logging.info("testsignal.root")
test_sig = loader("test","sig")
logging.info("testingbkg.root")
test_bkg = loader("test","bkg")
logging.info("uncut rootfile")
uncut = loader("test","uncut")
logging.info("Loading completed")
''' Reshaping Arrays'''
logging.info("Reshaping Arrays for correct input")
train_sig = reshape(train_sig)
logging.info("Done with Train_sig")
test_sig= reshape(test_sig)
logging.info("Done with test_sig")
train_bkg= reshape(train_bkg)
logging.info("Done with train_bkg")
test_bkg = reshape(test_bkg)
logging.info("Done with test_bkg")
uncut = reshape(uncut)
logging.info("Done with uncut")
logging.info("Applying cuts to data")
train_sig = cuts(train_sig)
test_sig = cuts(test_sig)
train_bkg = cuts(train_bkg)
test_bkg= cuts(test_bkg)
uncut = cuts(uncut)
""" Loading array and reshaping array loads the arrray into numpy, reshapes the array, then applies the cut."""
'''Classification Array '''
logging.info("Creating classification Arrays")
sig_class = [1]*len(train_sig)
bkg_class = [0]*len(train_bkg)
class_array = sig_class + bkg_class
class_array = np.array(class_array).astype(np.float32)
""" This grouping of functions creates a classification array, to determine whether an event is signal, or background. 1 is signal, 0 is background events """
'''Standardizing Data'''
logging.info("Standardizing Data")
#From the tips on pratical use section of scikit NN documentation, this is for standardizing the data. ONLY FIT ON TRAINING DATA
training_array = np.concatenate((train_sig, train_bkg), axis = 0)
scaler = StandardScaler()
scaler.fit(training_array)
training_array = scaler.transform(training_array)
uncut = scaler.transform(uncut)
test_bkg = scaler.transform(test_bkg)
""" This standaradization step is very important for neural networks within scikit. The standard scalar function creates a normalized set of data. Sadly I forget which version of
normalization it uses, but this is important. It takes in all of the data for training, both signal and background, to create the scalar. Then you apply it to the test data with the
transform method. """
''' Creating Classifier '''
logging.info("Creating Neural Net")
batch_size = len(class_array)
NN = nn.MLPClassifier()
#NN.set_params(hidden_layer_sizes=(5,))
#NN.set_params(alpha = 1)
NN.set_params(hidden_layer_sizes=(5,), activation ="logistic", solver="sgd", batch_size=batch_size,
learning_rate="adaptive", learning_rate_init = .02, max_iter= 600, random_state = None, shuffle = True, momentum =.01,
nesterovs_momentum = False)
"""This step is creating the classifier, with the paramaters I would like. The Hidden layer size is odd, because you end the dictionary with an empty entry. What (5,) is one hidden layer
with 5 nodes. (5,5,) would be 2 hidden layers, with 5 nodes each. This hidden layers do not include the input and output layers. The activation function is logistic.
This means that activating the node runs on a unit logistic function. SGD is one of the many training methods Scikit uses. After many different tests, I have determine this
is the most accurate for our data. The rest can be read from the documentation on scikits website. """
'''Training Classifier'''
logging.info("Time to train")
train_start = time.time()
NN.fit(training_array,class_array)
train_end = time.time()
print("Signal training took:",(train_end- train_start)/60, "minutes")
""" Fit is the training function in scikitlearn """
'''Saving Training Classifier'''
logging.info("Saving Classifier")
from sklearn.externals import joblib
if(os.path.isdir("NNClassifier4")==True):
shutil.rmtree("NNClassifier4")
os.mkdir("NNClassifier4")
else:
os.mkdir("NNClassifier4")
joblib.dump(NN, "NNClassifier4/NNClassifier4.pkl")
""" This set of code saves the classifier as a special pickle type to be used later on as needed"""
'''Saving Scaler'''
logging.info("Saving Scaler")
if(os.path.isdir("NNScaler4")==True):
pass
else:
os.mkdir("NNScaler4")
joblib.dump(scaler,"NNScaler4/NNScaler4.pkl")
""" This set of code saves the scalar as a special pickle for later use as needed"""
'''Predictions'''
logging.info("Predicting")
predict_sig = NN.predict(uncut)
predict_bkg = NN.predict(test_bkg)
logging.info("Probability Predicting")
prob_sig = NN.predict_proba(uncut)
prob_bkg = NN.predict_proba(test_bkg)
""" The predict method takes the testing data, and outputs whether or not it is a "0" or a "1". I.e, is it a signal, or background. The outputs are only 0 or 1. No inbetween.
predict_proba is similar to the predict method, but outputs an array, with columns equal to the number of possible outputs in predicit. In this instance, the array is nx2,
where n = # of events. Each column shows the probability that it is that type of classification. For us, the first column is the probability it is a 0, or background, and the
second column is hte probability it is a signal event, or 1. Adding these two numbers should =1. """
'''ROC Curve'''
logging.info("Creating ROC Curve")
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
merged_prob = np.concatenate((prob_sig[:,1],prob_bkg[:,1]),axis=0)
#####################################################
# #
# NOTE: [:,1] is for all of the second column of #
# proba_predict function's output. we are #
# sending the probabilities of the event being #
# a signal event through the ROC Curve #
# #
#####################################################
class_prob_sig = [1]*len(prob_sig[:,1])
class_prob_bkg = [0]*len(prob_bkg[:,1])
merged_prob_class = class_prob_sig + class_prob_bkg
fpr_sig, tpr_sig, thresholds_sig = roc_curve(merged_prob_class, merged_prob)
auc_score= roc_auc_score(merged_prob_class, merged_prob)
text = "AUC score is: "+ str(auc_score)
""" The purpose of this group of code is to create the data necessary for a ROC curve. The method to create the roc curve data is as follows:
false_positive, true_positives, thresholds = roc_curve(classification_array, array_of_test_data). This only works because our "test" date is actually known to us.
This is how we are able to create the ROC curve. All the code above that line is just creating the arrays necessary. the AUC score, or area under the curve
gives you a single number. This number is considered the "accuaracy" of the classifier, which takes into account how well a classifier can identify a true event, and identify
a background event.
A ROC curve is made with the x axis as the flase positive arrays, and the y axis as the true positive arrays. """
''' Making Histograms and Trees'''
logging.info("Making Histograms")
fig = plt.figure()
plot1 = fig.add_subplot(1,1,1)
plot1.hist(predict_sig, histtype ="bar", log = True)
plot1.set_title("NN Response (Signal)")
plot1.set_xlabel("Response (0 = Background, 1= Signal)")
plot1.set_ylabel("# of Events")
fig2 = plt.figure()
plot2 = fig2.add_subplot(1,1,1)
plot2.hist(predict_bkg, histtype ="bar", log = True)
plot2.set_title("NN Response (Background)")
plot2.set_xlabel("Response (0 = Background, 1= Signal)")
plot2.set_ylabel("# of Events")
fig3 = plt.figure()
plot3 = fig3.add_subplot(1,1,1)
plot3.hist(prob_sig[:,0], histtype ="bar", log = True)
plot3.set_title("NN Probability of event being bkg(sig)")
plot3.set_xlabel("Probability of Being Background")
plot3.set_ylabel("# of Events")
fig4 = plt.figure()
plot4 = fig4.add_subplot(1,1,1)
plot4.hist(prob_sig[:,1], histtype ="bar", log = True)
plot4.set_title("NN Probability of event being signal(sig)")
plot4.set_xlabel("Probability of Being Signal")
plot4.set_ylabel("# of Events")
fig5 = plt.figure()
plot5 = fig5.add_subplot(1,1,1)
plot5.hist(prob_bkg[:,0], histtype ="bar", log = True)
plot5.set_title("NN Probability of event being bkg(bkg)")
plot5.set_xlabel("Probability of Being Background")
plot5.set_ylabel("# of Events")
fig6 = plt.figure()
plot6 = fig6.add_subplot(1,1,1)
plot6.hist(prob_bkg[:,1], histtype ="bar", log = True)
plot6.set_title("NN Probability of event being signal(bkg)")
plot6.set_xlabel("Probability of Being Signal")
plot6.set_ylabel("# of Events")
fig7 = plt.figure()
plot7 = fig7.add_subplot(1,1,1)
plot7.plot(fpr_sig,tpr_sig,"b", label = "ROC")
plot7.set_title("Reciever Operating Characteristic for Neural Net(w/ Uncut Data)")
plot7.set_xlabel("False Positve Rate")
plot7.set_ylabel("True Positive Rate")
plot7.plot([0,1],[0,1],"r--", label = "Random Guess Line")
plot7.set_xlim([0,1])
plot7.set_ylim([0,1])
plot7.grid(True)
plot7.legend(bbox_to_anchor=(0.46,1))
plot7.annotate(text, xy = (0.5,0.1))
""" These 7 graphs are confusing, but they are described as follows. All plots except for 7 are log graphs for the Y axis:
1. This plots the outputs the results from predict purely on the signal data.
2. This plots the outputs of predict purely on the background data.
3. This plots the probabilities of events that are known to be signal to be a background event from predict_proba
4. This plots the probabilities of events that are known to be signal to be a signal event from predict_proba
5. This plots the probabilities of events that are known to be background to be a background event from predict_proba
6. This plots the probabilities of events that are known to be background to be a signal event from predict_proba
7. This plots the ROC curve, with the AUC score written on it. There is a Random guess line, which shows what the ROC score would be if the classifier would just randomly
guess whether the even was signal or background """
if(os.path.isdir("graphs4")==True):
pass
else:
os.mkdir("graphs4")
save = fig.savefig("graphs4/NN_Predict_Sig.png")
save2 = fig2.savefig("graphs4/NN_Predict_Bkg.png")
save3 = fig3.savefig("graphs4/NN_sig_col0.png")
save4 = fig4.savefig("graphs4/NN_sig_col1.png")
save5 = fig5.savefig("graphs4/NN_bkg_col0.png")
save6 = fig6.savefig("graphs4/NN_bkg_col1.png")
save7 = fig7.savefig("graphs4/NN_ROC_Curve.png")
end_time = time.time()
print("completed in", (end_time - start_time)/60, "minutes")
plt.draw()
""" This set of codes saves the graphs as PNG's, and plt.draw should show the pictures up on your screen. """