-
Notifications
You must be signed in to change notification settings - Fork 0
/
logistic_regression_tf_idf.py
66 lines (56 loc) · 2.7 KB
/
logistic_regression_tf_idf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import re
from scipy.sparse import hstack
# Directory holding the pre-split dataset. Kept in a single constant so the
# location only has to be edited in one place.
DATA_DIR = "C:/Users/teots/OneDrive/Υπολογιστής/Creating Queries For Batch Indexing/Final_Dataset_Finished_Processing_Train_Val_Test"


def _load_split(name):
    """Read ``<DATA_DIR>/<name>.csv`` and drop rows with a missing 'Document'.

    Args:
        name: Split file name without the ``.csv`` extension.

    Returns:
        The DataFrame read by ``pd.read_csv`` with every row whose
        'Document' value is null removed. Other columns are not checked.
    """
    return pd.read_csv(f"{DATA_DIR}/{name}.csv").dropna(subset=['Document'])


train = _load_split("train")
test = _load_split("test")
val = _load_split("val")
# --- Text features -------------------------------------------------------
# The vectorizer is fitted on the training documents only; the validation
# and test documents are then transformed with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
print("Vectorize document...")
X_train_text, X_val_text, X_test_text = (
    tfidf_vectorizer.fit_transform(train['Document']),
    tfidf_vectorizer.transform(val['Document']),
    tfidf_vectorizer.transform(test['Document']),
)
print("Vectorization finished")

# --- Numeric features ----------------------------------------------------
# The same two count columns are taken from every split.
numeric_columns = ['Num_Emoji', 'Num_Bad_Words']
X_train_numeric = train[numeric_columns].values
X_val_numeric = val[numeric_columns].values
X_test_numeric = test[numeric_columns].values

# Debug output: container type of the text features and the shape of the
# numeric training block.
print(type(X_train_text))
print(X_train_numeric.shape)

# --- Combined design matrices --------------------------------------------
# Column-wise concatenation: TF-IDF columns first, numeric columns last.
X_train = hstack([X_train_text, X_train_numeric])
X_val = hstack([X_val_text, X_val_numeric])
X_test = hstack([X_test_text, X_test_numeric])

# --- Targets -------------------------------------------------------------
y_train, y_val, y_test = (
    train['Credibility'],
    val['Credibility'],
    test['Credibility'],
)
# Train the classifier on the combined training features. The iteration cap
# is set to 100,000,000 (presumably so the solver is never cut off early).
reg = LogisticRegression(max_iter=100_000_000)
reg.fit(X_train, y_train)

# Predict labels for both the validation and the test split.
y_val_pred, y_test_pred = reg.predict(X_val), reg.predict(X_test)
def _report_scores(split_name, y_true, y_pred, end='\n'):
    """Print accuracy and F1 scores for one data split.

    The lines printed are, in order: accuracy, per-class F1
    (``average=None``), micro F1, macro F1 and weighted F1, each prefixed
    with ``split_name``.

    Args:
        split_name: Label placed at the start of every printed line
            (e.g. "Val" or "Test").
        y_true: Ground-truth labels for the split.
        y_pred: Labels predicted by the model for the same rows.
        end: Terminator passed to the final ``print`` call; use ``'\\n\\n'``
            to leave a blank line after the report.
    """
    accuracy = accuracy_score(y_true, y_pred)
    print(f"{split_name} Accuracy:", accuracy)

    # Compute every F1 variant before printing any of them, so the printed
    # lines stay contiguous.
    f1 = f1_score(y_true, y_pred, average=None)
    f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_weighted = f1_score(y_true, y_pred, average='weighted')

    print(f"{split_name} F1", f1)
    print(f"{split_name} F1 micro", f1_micro)
    print(f"{split_name} F1 macro", f1_macro)
    print(f"{split_name} F1 weighted", f1_weighted, end=end)


# Evaluate the validation split; the extra newline separates it from the
# test report that follows.
_report_scores("Val", y_val, y_val_pred, end='\n\n')

# Evaluate the test split.
_report_scores("Test", y_test, y_test_pred)