-
Notifications
You must be signed in to change notification settings - Fork 0
/
logistic_baseline_sklearn.py
95 lines (81 loc) · 3.27 KB
/
logistic_baseline_sklearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import numpy as np
import pandas as pd
from project_utils import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from scipy.sparse import hstack
from sys import argv
from word_embeddings import get_tokenized_sentences
APPROACH = "ngram"
CLASSIFIER = "logistic"
FLAVOR = "sklearn-SAG"
use_local_embeddings = False
path = ""
dims = 50
def fit_logistic_ngram_sklearn(train, dev, test, dataset="toxic",
cnames=CLASS_NAMES):
"""Fits and evals ngram logistic regression model with sklearn package.
Args:
train: a pd.DataFrame of the training data.
dev: a pd.DataFrame of the dev data.
vocab:
Returns:
average ROC-AUC score over classes.
"""
if dataset == "toxic":
vecpath = TFIDF_VECTORS_FILE_TOXIC
elif dataset == "attack":
vecpath = TFIDF_VECTORS_FILE_AGG
if use_local_embeddings:
sentences = get_tokenized_sentences(use_local=True, load_files=False, \
local_embeddings_path=path, n_dims=dims)
train_vecs = sentences.get("train").get("vectors")
test_vecs = sentences.get("test").get("vectors")
dev_vecs = sentences.get("dev").get("vectors")
else:
train_vecs, dev_vecs, test_vecs = \
vectorize_corpus_tf_idf(train, dev, test, sparse=True, path=vecpath)
# Doing one-vs-all training
auc_scores = []
for class_name in [cnames[x] for x in range(len(cnames))]:
print('doing class {}'.format(class_name))
# Training model
train_target = train[class_name]
classifier = LogisticRegression(solver='sag')
model = classifier.fit(train_vecs, train_target)
# Computing ROC
test_pred = model.predict_proba(test_vecs)
test_target = get_onehots_from_labels(test[class_name].values)
ROC_AUC_score = roc_auc_score(test_target, test_pred)
auc_scores.append(ROC_AUC_score)
print('--AUC score is {}'.format(ROC_AUC_score))
return auc_scores
if __name__ == '__main__':
myargs = getopts(argv)
if myargs['-dataset'] == 'attack':
if not os.path.isfile(ATTACK_AGGRESSION_FN):
get_and_save_talk_data()
train, dev, test = get_TDT_split(
pd.read_csv(ATTACK_AGGRESSION_FN, index_col=0).fillna(' '))
cnames = train.columns.values[0:2]
aucfn = "auc_scores_attack.csv"
elif myargs['-dataset'] == 'toxic':
train, dev, test = get_TDT_split(
pd.read_csv('train.csv').fillna(' '))
cnames = CLASS_NAMES
aucfn = "auc_scores.csv"
if "-local" in myargs:
local_path = myargs["-local"]
if os.path.isfile(local_path):
if "-dims" in myargs:
dims = int(myargs["-dims"])
use_local_embeddings = True
path = local_path
else:
print "Please speficy the number of dimensions to use custom vectors"
else:
print "Could not find embeddings with that path"
auc_scores = fit_logistic_ngram_sklearn(train, dev, test, dataset=myargs['-dataset'], cnames=cnames)
save_auc_scores(auc_scores, APPROACH, CLASSIFIER, FLAVOR, cnames=cnames,
fn=aucfn)
print('Avg ROC score is {}'.format(np.mean(auc_scores)))