-
Notifications
You must be signed in to change notification settings - Fork 0
/
SupportVectorMachine.py
146 lines (117 loc) · 4.82 KB
/
SupportVectorMachine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
'''Support Vector Machine classifier'''
# DEBUG
# fixes cudart64_110.dll error
#os.add_dll_directory("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.2/bin")
import os
import json
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from dataParser import read_articles, read_single
from BaseModel import BaseModel
from sklearn.model_selection import GridSearchCV
# Support vector machine model
class SupportVectorMachine(BaseModel):
    """Support Vector Machine classifier.

    Combines a TF-IDF representation of the raw text with a bag-of-words
    over spaCy part-of-speech tags, feeds the union into a LinearSVC, and
    tunes hyper-parameters with grid search.
    """

    def __init__(self):
        # Extra CLI argument consumed by BaseModel's argument parser.
        self.arguments = [
            {
                "command": "-cv",
                "refer": "--cv",
                "default": 3,
                "action": None,
                # BUG FIX: key was "type:" (stray colon), so the int
                # conversion was never registered with the parser.
                "type": int,
                "help": "Determines the cross-validation splitting strategy"
            },
        ]
        super().__init__()
        self.name = "SupportVectorMachine"
        spacy.prefer_gpu()
        # Load the spaCy pipeline used for POS tagging.
        self.nlp = spacy.load('en_core_web_sm')

    def identity(self, x):
        """Return the lowercase of the input (used as a no-op analyzer)."""
        return x.lower()

    def smartJoin(self, x):
        """Join a token list into one lowercase string, keeping only
        alphabetic tokens/characters."""
        return ''.join([i.lower() for i in x if i.isalpha()])

    def spacy_pos(self, txt):
        """Map a text to the sequence of its lowercase POS tags."""
        return [token.pos_.lower() for token in self.nlp(txt.lower())]

    def create_model(self):
        """Build the grid-search wrapper around the feature-union + SVC
        pipeline. Expects ``self.param_grid`` and ``self.args.cv`` to be set.
        """
        count = CountVectorizer(preprocessor=self.smartJoin,
                                tokenizer=self.spacy_pos)
        tf_idf = TfidfVectorizer(preprocessor=self.identity,
                                 tokenizer=self.identity)
        union = FeatureUnion([("tf_idf", tf_idf), ("count", count)])
        return GridSearchCV(
            # Combine the union feature with a LinearSVC.
            estimator=Pipeline([("union", union), ('cls', LinearSVC())]),
            param_grid=self.param_grid,
            cv=self.args.cv,
            verbose=3
        )

    def train_model(self, model, X_train, Y_train):
        """Fit the grid search, record its results on the instance, and
        return the best estimator."""
        model = model.fit(X_train, Y_train)
        self.gs_cv_results = model.cv_results_
        self.gs_best_params = model.best_params_
        self.gs_best_score = model.best_score_
        print(f'best training score {model.best_score_} '
              f'with params {model.best_params_}')
        return model.best_estimator_

    def perform_classification(self, model, X, Y):
        """Predict on (X, Y), print a human-readable report, and return the
        report as a dict for serialization."""
        Y_pred = model.predict(X)
        target_names = ['left-center', 'right-center']
        print(classification_report(Y, Y_pred, target_names=target_names,
                                    digits=4))
        # NOTE: ``digits`` is ignored when output_dict=True, so it is
        # omitted here (the original passed an inconsistent digits=3).
        return classification_report(Y, Y_pred, output_dict=True,
                                     target_names=target_names)

    def write_run_to_file(self, parameters, results):
        """Persist run parameters, the param grid, and the classification
        report to a versioned JSON file under results/<model name>/."""
        res_dir = 'results/' + self.name
        # Make sure the (sub)directory exists.
        os.makedirs(res_dir, exist_ok=True)
        # Retrieve version based on number of files in directory.
        path, dirs, files = next(os.walk(res_dir))
        version = len(files)
        result = {
            'parameters': parameters,
            'param_grid': self.param_grid,
            'classification_report': results
        }
        # BUG FIX: the original json.dump(..., open(...)) leaked the file
        # handle; ``with`` guarantees it is flushed and closed.
        out_path = os.path.join(res_dir,
                                'experiment_' + str(version).zfill(2) + '.json')
        with open(out_path, 'w') as fp:
            json.dump(result, fp)
if __name__ == "__main__":
    svm = SupportVectorMachine()
    X_train, Y_train, X_dev, Y_dev, X_test, Y_test = read_articles()

    # Optionally rebalance the training split.
    if svm.args.undersample:
        X_train, Y_train = svm.under_sample_training_data(X_train, Y_train)

    # Optimal parameters from earlier grid-search runs are pinned here.
    svm.param_grid = {
        'union__tf_idf__max_df': [0.5],
        'union__tf_idf__min_df': [0.0001],
        'union__tf_idf__ngram_range': [(1, 3)],
        'cls__C': [0.5],
    }

    if svm.args.load_model:
        model = svm.load_sk_model()
    else:
        # Train from scratch and persist the fitted estimator.
        print('Training model')
        model = svm.train_model(svm.create_model(), X_train, Y_train)
        svm.save_sk_model(model)

    # Evaluation: a COP edition takes precedence, then test, then dev.
    if svm.args.cop:
        print(f'Predicting {svm.args.cop}')
        X_cop, Y_cop = read_single(svm.args.cop)
        results = svm.perform_classification(model, X_cop, Y_cop)
    elif svm.args.test:
        print('Using best estimator on Test set')
        results = svm.perform_classification(model, X_test, Y_test)
    else:
        print('Using best estimator on Dev set')
        results = svm.perform_classification(model, X_dev, Y_dev)

    svm.write_run_to_file(vars(svm.args), results)