-
Notifications
You must be signed in to change notification settings - Fork 1
/
proj1.py
319 lines (260 loc) · 11.4 KB
/
proj1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
# Author: Hitesh Bhavsar, Jose Gallardo, Amish Mathur, Mike Mai
# This file contains all the code required for the project: from reading an input .csv file to building DTrees and giving the classification of the decision tree based on thier computed accurracies.
# All functions in this file also have comments explaining in detail the purpose of thier use and what they intended to do when called given the correct parameters.
import random
import math
# Each tree node consists of:
# 1. a vector list => [([feature vector], class), ..]
# 2. the selected attribute to split the vector list
# 3. its children => {selected attribute value : node, ..}
class node:
    """A decision-tree node.

    Holds the training vectors routed to this node, the index of the
    attribute it was split on (None until split), and a mapping from
    attribute value to child node.
    """

    def __init__(self):
        # [(feature_vector, class[, ID]), ...] tuples belonging to this node
        self.vecs = []
        # index of the attribute this node was split on; None while unsplit
        self.attr_split = None
        # attribute value -> child node
        self.children = {}
# function to read data file
# parameter: data file name or path to data file
# return: list of featre vectors and its corresponding class pair => [([feature_vector], class)..]
def read_file(filename):
    """Read the csv data file into feature-vector records.

    Each line is expected to hold 6 attribute values followed by the
    class label. Returns a list of (feature_vector, class, ID) triples,
    where ID is the 0-based line number.

    Fixes: the file is now closed deterministically via a context manager
    (the original leaked the handle), and the hand-maintained ID counter
    is replaced by enumerate.
    """
    fea_vecs = []
    with open(filename, "r") as file:
        for ID, line in enumerate(file):
            values = line.split(",")
            # first 6 fields are the attributes; 7th is the class label
            fea_vecs.append((values[:6], values[6].replace("\n", ""), ID))
    return fea_vecs
# Module-level data: attribute names, the full data set read from disk,
# and the three working sets that sets_generator() fills in.
feature_vecs = []
attributes = ["White King file", " White King rank", "White Rook file", "White Rook rank", "Black King file", "Black King rank"]
feature_vecs = read_file("550-p1-cset-krk-1.csv")
Training_Set, Holdout_Set, Validation_Set = [], [], []
def sets_generator():
    """Randomly partition feature_vecs into the three module-level sets.

    60% of the data goes to Training_Set; the remainder is split evenly
    between Holdout_Set and Validation_Set. If the remainder is odd, one
    extra vector is moved into the training set so the two halves match.

    Bug fix: the original wrote `remaining_data - 1`, a no-op expression
    statement, where a decrement was intended; it is now `remaining_data -= 1`.
    (The final set sizes are unchanged because int(odd/2) floored anyway,
    but the code now does what it says.)
    """
    total_data = len(feature_vecs)
    data_indexes = list(range(total_data))
    total_training_set_data = int(total_data * .60)
    remaining_data = total_data - total_training_set_data
    # Keep holdout and validation the same size when the remainder is odd.
    if remaining_data % 2 == 1:
        total_training_set_data += 1
        remaining_data -= 1
    Training_indexes = random.sample(data_indexes, total_training_set_data)
    for element in Training_indexes:
        Training_Set.append(feature_vecs[element])
        data_indexes.remove(element)
    Holdout_indexes = random.sample(data_indexes, int(remaining_data / 2))
    for element in Holdout_indexes:
        Holdout_Set.append(feature_vecs[element])
        data_indexes.remove(element)
    # Everything still unassigned becomes the validation set.
    Validation_indexes = list(data_indexes)
    for element in Validation_indexes:
        Validation_Set.append(feature_vecs[element])
        data_indexes.remove(element)
def entropy(class_occurrence, total):
    """Shannon entropy (base 2) of a class-frequency table.

    class_occurrence maps class label -> occurrence count; total is the
    number of vectors the counts were taken over.
    """
    acc = 0
    for count in class_occurrence.values():
        p = count / total
        acc += -p * math.log(p, 2)
    return acc
# Information gain: the set's entropy minus the probability-weighted
# average of the per-attribute-value entropies.
def FindInfoGain(entropy_set, entropy_attr, probab):
    weighted_avg = 0
    for ent, prob in zip(entropy_attr, probab):
        weighted_avg += ent * prob
    return entropy_set - weighted_avg
# Select best attribut to split a node
# Take as input a tree node
# When finish, tree node's property is modified accordingly
def split_node(root, attrs):
    """Choose the attribute with the greatest information gain and split root on it.

    root  -- tree node whose .vecs holds (feature_vector, class[, ID]) tuples
    attrs -- attribute names, parallel to the feature vector, used for printing

    Side effects: sets root.attr_split to the chosen attribute index and
    fills root.children with one child node per attribute value seen; also
    prints the per-attribute entropies and information gains.
    """
    # Entropy of the whole node: count class occurrences first.
    class_occurrence = dict()
    for pair in root.vecs:
        if pair[1] in class_occurrence:
            class_occurrence[pair[1]] += 1
        else:
            class_occurrence[pair[1]] = 1
    total = len(root.vecs)
    root_ent = entropy(class_occurrence, total)
    print("root entropy:", root_ent)
    # For each attribute i, compute the weighted average entropy over the
    # partitions induced by that attribute's values.
    info_gain = list()
    for i in range(len(root.vecs[0][0])):
        # class_occurence[value] maps class -> count within that partition;
        # total[value] is the partition size (note: this rebinds the outer
        # `total`, which is no longer needed at this point).
        class_occurence = dict()
        total = dict()
        for pair in root.vecs:
            if pair[0][i] not in class_occurence:
                class_occurence[pair[0][i]] = dict()
                total[pair[0][i]] = 1
            else:
                total[pair[0][i]] += 1
            if pair[1] in class_occurence[pair[0][i]]:
                class_occurence[pair[0][i]][pair[1]] += 1
            else:
                class_occurence[pair[0][i]][pair[1]] = 1
        # (partition size, partition entropy) pairs for this attribute.
        entropy_list = list()
        for attr_val in class_occurence:
            entropy_list.append((total[attr_val], entropy(class_occurence[attr_val], total[attr_val])))
        avg_ent = 0
        for ent in entropy_list:
            avg_ent += ent[0] / len(root.vecs) * ent[1]
        # info_gain temporarily holds average entropies; converted to gains below.
        info_gain.append(avg_ent)
        print("Attribute " + attrs[i] + "'s average entropy:", avg_ent)
    # Convert to information gain (root entropy - average entropy), tracking
    # the best attribute as (index, gain).
    print()
    max_info = (0, 0)
    for i in range(len(info_gain)):
        info_gain[i] = root_ent - info_gain[i]
        if info_gain[i] > max_info[1]:
            max_info = (i, info_gain[i])
        print("Attribute " + attrs[i] + "'s information gain:", info_gain[i])
    print("Attribute " + attrs[max_info[0]] + " has the greatest information gain, so it is selected as the attribute to split\n")
    # If every vector shares one value for the chosen attribute, splitting on
    # it would yield a single child identical to root; advance to the next
    # attribute index until one with differing values is found.
    # NOTE(review): if all remaining attributes are constant across root.vecs,
    # max_info_index runs past the end of the feature vector and raises
    # IndexError -- confirm the data guarantees this cannot happen.
    same_attr_val = True
    max_info_index = max_info[0]
    while same_attr_val:
        val = root.vecs[0][0][max_info_index]
        for pair in root.vecs:
            if pair[0][max_info_index] != val:
                same_attr_val = False
        if same_attr_val:
            print("\n\n", max_info_index, val, root.vecs, "\n\n")
            max_info_index += 1
    # Update root node properties: record the split and bucket the vectors
    # into one child per attribute value.
    root.attr_split = max_info_index
    for pair in root.vecs:
        if pair[0][root.attr_split] not in root.children:
            root.children[pair[0][root.attr_split]] = node()
        root.children[pair[0][root.attr_split]].vecs.append(pair)
# check whether a node is a leaf node
def isLeaf(root):
    """Return True when every vector in the node carries the same class."""
    first_class = root.vecs[0][1]
    return all(entry[1] == first_class for entry in root.vecs)
# build a decision tree using the provided root node
def buildDTree(root):
    """Grow a decision tree breadth-first from root.

    Every impure node is split with split_node; pure children are reported
    as leaves, impure children are queued for further splitting.
    """
    frontier = [root]
    while frontier:
        current = frontier.pop(0)
        split_node(current, attributes)
        for value in current.children:
            child = current.children[value]
            if isLeaf(child):
                print(str(attributes[current.attr_split]) + " attribute has leaf node " + str(value) + " with class: " + str(child.vecs[0][1]))
            else:
                frontier.append(child)
# classified the provided vector using the provided rooted tree
def classifier(root, feat_vec):
    """Classify feat_vec by walking the tree from root.

    At each non-leaf node, follow the child matching the vector's value for
    the node's split attribute. If the node has no child for that value,
    fall back to a majority vote over the node's training vectors. Returns
    the predicted class label.

    Fix: the original shadowed the builtin `max` with a local tuple; the
    majority vote now uses the `max` builtin with a key function (same
    first-maximum tie-breaking as the original manual scan).
    """
    curr = root
    while not isLeaf(curr):
        branch = feat_vec[0][curr.attr_split]
        if branch in curr.children:
            curr = curr.children[branch]
        else:
            # Unseen attribute value: vote with this node's class counts.
            class_occurence = dict()
            for pair in curr.vecs:
                class_occurence[pair[1]] = class_occurence.get(pair[1], 0) + 1
            return max(class_occurence, key=class_occurence.get)
    return curr.vecs[0][1]
def accuracy(tree_root, v_set):
    """Evaluate the tree rooted at tree_root on v_set.

    Prints the IDs (third tuple element) of misclassified vectors and
    returns (fraction_correct, incorrect_indexes), where incorrect_indexes
    are positions into v_set.

    Idiom fixes: enumerate replaces the hand-maintained counter, and the
    stray semicolon and redundant parentheses are gone. Behavior unchanged.
    """
    correct = 0
    incorrect_indexes = []
    print("Incorrect classification for vectors: ")
    for i, pair in enumerate(v_set):
        if classifier(tree_root, pair) == pair[1]:
            correct += 1
        else:
            incorrect_indexes.append(i)
            print(str(pair[2]) + " ", end='')
    print("")
    return correct / len(v_set), incorrect_indexes
def bagging_replacement(root, t_set, holdout_set):
    """Build a new training/holdout split by sampling with replacement.

    root        -- decision tree used to find misclassified holdout vectors
    t_set       -- current training set
    holdout_set -- current holdout set

    Returns (final_t_set, final_holdout_set): a bootstrap sample the same
    size as t_set, and a deduplicated holdout set made of the union minus
    whatever the sample took.
    """
    union_set = t_set + holdout_set
    final_t_set = []
    final_holdout_set = []
    # Append each misclassified holdout vector twice more so those vectors
    # get extra weight in the resample (boosting-style emphasis).
    # NOTE(review): accuracy() also prints the misclassified IDs as a side effect.
    for i in accuracy(root, holdout_set)[1]:
        union_set.append(holdout_set[i])
        union_set.append(holdout_set[i])
    # Draw len(t_set) vectors with replacement.
    # NOTE(review): randint is bounded by len(t_set) - 1, so only the t_set
    # prefix of union_set can ever be drawn -- the holdout vectors and the
    # duplicated misclassified ones appended above are unreachable. Confirm
    # whether the bound should be len(union_set) - 1.
    for i in t_set:
        add_index = random.randint(0, len(t_set) - 1)
        final_t_set.append(union_set[add_index])
    # remove duplicates before removing items in final_t_set
    for item in union_set:
        if item not in final_holdout_set:
            final_holdout_set.append(item)
    # Whatever the bootstrap sample took leaves the holdout set.
    for item in final_t_set:
        if item in final_holdout_set:
            final_holdout_set.remove(item)
    return final_t_set, final_holdout_set
def print_ids(data_set):
    """Print the ID (third element) of every vector, space-separated, on one line."""
    print("".join(str(vec[2]) + " " for vec in data_set))
'''
attributes = ["crust size", "shape", "filling size"]
root = node()
root.vecs = [(["big", "circle", "small"], "pos"),(["small", "circle", "small"], "pos"),(["big", "square", "small"], "neg"),(["big", "triangle", "small"], "neg"),(["big", "square", "big"], "pos"),(["small", "square", "small"], "neg"),(["small", "square", "big"], "pos"),(["big", "circle", "big"], "pos")]
print("\n\nTest Data:\n", root.vecs, "\n\nAttributes:\n", attributes, "\n")
split_node(root, attributes)
print("\nAfter splitting:")
for child in root.children:
print("Attribute " + child + " has vector list:\n", root.children[child].vecs, "\n")
'''
# ---- Script: build the first decision tree, evaluate it, then resample ----
# NOTE(review): attributes and feature_vecs were already assigned identically
# at module level above; these re-assignments are redundant but harmless.
attributes = ["White King file", " White King rank", "White Rook file", "White Rook rank", "Black King file", "Black King rank"]
feature_vecs = read_file("550-p1-cset-krk-1.csv")
# Randomly partition the data into the three global sets.
sets_generator()
print("Tree 1 Training Set IDs: ")
print_ids(Training_Set)
print("Tree 1 Holdout Set IDs: ")
print_ids(Holdout_Set)
print("Validation Set IDs")
print_ids(Validation_Set)
# Build the first decision tree from the training set (skip if it is
# already pure -- buildDTree assumes an impure root).
root = node()
root.vecs = Training_Set
if not isLeaf(root):
    buildDTree(root)
# print(classifier(root, Holdout_Set[5]))
# NOTE(review): the message says "holdout" but the accuracy below is computed
# on Validation_Set -- confirm which set was intended.
print("Printing incorrect holdout vectors and first dTree accuracy")
print(accuracy(root, Validation_Set)[0])
# Resample training/holdout with replacement, emphasising misclassified vectors.
Training_Set, Holdout_Set = bagging_replacement(root, Training_Set, Holdout_Set)
print("-------------------------------------------------------------------------------------------------------------")
print("-------------------------------------------------------------------------------------------------------------")
print("Tree 2 Training Set IDs: ")
print_ids(Training_Set)
print("Tree 2 Holdout Set IDs: ")
print_ids(Holdout_Set)
print("Validation Set IDs")
print_ids(Validation_Set)
# Build the second ("boosting") tree from the resampled training set.
boostingT_root = node()
boostingT_root.vecs = Training_Set
if not isLeaf(boostingT_root):
    buildDTree(boostingT_root)
print("Printing incorrect holdout vectors and second dTree accuracy")
print(accuracy(boostingT_root, Holdout_Set)[0])
def ensemble_classifier(data_set, root):
    """Classify every vector in data_set with the tree at root; return the labels."""
    return [classifier(root, vec) for vec in data_set]
# ---- Script: compare both trees on the validation set and let the better
# one classify the example vector ----
print("Printing incorrect vectors in validation classification & accuracy")
accuracy_tree1=accuracy(root, Validation_Set)[0]
accuracy_tree2=accuracy(boostingT_root,Validation_Set)[0]
print("Validation accuracy of 1st decision tree : " + str(accuracy_tree1))
print("Validation accuracy of Boosting tree : " + str(accuracy_tree2))
vector_test=[(['c', '2', 'a', '8', 'a', '1'], 'zero')] # Example dataset/ vector
values=[]
# The "ensemble" simply delegates to whichever tree scored higher on the
# validation set (ties go to the boosting tree via the else branch).
if(accuracy_tree1>accuracy_tree2):
    print("As accuracy of 1st decission tree is greater than boosting, giving classification to 1st decission tree (greater weight)")
    print("Accuracy of ensemble is equal to 1st decision tree: " + str(accuracy_tree1))
    values=ensemble_classifier(vector_test ,root)
else:
    print("As accuracy of boosting is greater than 1st decission tree, giving classification to boosting tree (greater weight)")
    print("Accuracy of ensemble is equal to boosting tree: " + str(accuracy_tree2))
    values=ensemble_classifier(vector_test ,boostingT_root)
# Output of the classified class for the dataset entered
print("Classification of vector: " + str(vector_test) + " is " + str(values))