test.py
import csv
import string

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize

# Load the labelled training cases.
all_training = []
all_labels = []
with open('cases.csv', 'r') as f:
    reader = csv.DictReader(f)
    cases = [row for row in reader]

for case in cases:
    all_training.append(case["text"])
    all_labels.append(case["label"])

def train(classifier, X, y):
    """Hold out 25% of the data, fit the classifier, and report test accuracy."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
    classifier.fit(X_train, y_train)
    print("Accuracy: %s" % classifier.score(X_test, y_test))
    return classifier

def stemming_tokenizer(text):
    """Tokenize the text and stem each token with the Porter stemmer."""
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

# TF-IDF features (English stop words and punctuation removed) feeding a
# multinomial naive Bayes classifier.
trial1 = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.5)),
])
clf = train(trial1, all_training, all_labels)

# Load the unlabelled cases and predict their labels with the trained model.
all_testing = []
with open('predictions.csv', 'r') as f:
    reader = csv.DictReader(f)
    predictions = [row for row in reader]

for prediction in predictions:
    all_testing.append(prediction["text"])

answers = clf.predict(all_testing)
print(answers)
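
# --- Optional sketch, not part of the original script ------------------------
# stemming_tokenizer is defined above but never wired into the pipeline. One
# way to use it is to pass it to TfidfVectorizer's `tokenizer` argument
# (with `token_pattern=None` so the default token pattern is not silently
# ignored). The stop-word list is stemmed as well so it matches the stemmed
# tokens. The names `stemmed_stops`, `trial2`, and `clf2` are illustrative.
stemmed_stops = [PorterStemmer().stem(w) for w in stopwords.words('english')] + list(string.punctuation)
trial2 = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,
                                   token_pattern=None,
                                   stop_words=stemmed_stops)),
    ('classifier', MultinomialNB(alpha=0.5)),
])
clf2 = train(trial2, all_training, all_labels)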