model_pipeline_p.py
### preprocessing ###
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import analysis_p
import load_p
import genres_p
import tokens_p
import feature_selection_p
import os
def get_y(df: pd.DataFrame) -> np.ndarray:
    """Return the integer codes of the categorical 'genre' column as the label vector."""
    y = df['genre'].cat.codes.values
    return y
df = load_p.get_df_flow()
# df = genres_p.balance_genre_size(genres_p.parse_genres_flow(df, genres_p.extract_d0s_replace))
df = genres_p.parse_genres_flow(df, genres_p.extract_d0s_replace)
vectorizer, X = tokens_p.tokenize_flow(df, min_df=20, max_features=18000)
print(X.shape)
# confirm the vectorizer worked (get_feature_names() was renamed get_feature_names_out() in scikit-learn >= 1.0)
tokens_p.preview_features(df['body'], X, vectorizer.get_feature_names_out(), 113, 20)
Z = feature_selection_p.get_pca_features(X, n_components=100)
print(Z.shape)
y = get_y(df)
Xtr, Xts, ytr, yts, Ztr, Zts = train_test_split(X, y, Z, random_state=0)
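# Optional sanity check (sketch): the raw-count split and the PCA split share the same
# random permutation, so their row counts should match; also inspect the class balance
# of the training labels.
print(Xtr.shape, Ztr.shape, ytr.shape)
print(pd.Series(ytr).value_counts(normalize=True))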
### ComplementNB ###
from sklearn.naive_bayes import ComplementNB
clf = ComplementNB()
clf.fit(Xtr, ytr)
yhat_clf = clf.predict(Xts)
acc_clf = np.mean(yhat_clf == yts)
print(acc_clf)
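# Optional sketch: per-class precision/recall for the Naive Bayes baseline,
# mirroring the SVM report further down.
import sklearn.metrics as metrics
print(metrics.classification_report(yts, yhat_clf,
                                    labels=range(df['genre'].cat.categories.values.size),
                                    target_names=df['genre'].cat.categories.values))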
### SVM ###
import sklearn.svm as svm
svc = svm.SVC()
svc.fit(Ztr, ytr)
yhat = svc.predict(Zts)
svm_accuracy = np.mean(yhat == yts)
print(svm_accuracy)
import sklearn.metrics as metrics
print(metrics.classification_report(yts, yhat,
                                    labels=range(df['genre'].cat.categories.values.size),
                                    target_names=df['genre'].cat.categories.values))
analysis_p.misclassified_analysis(yhat, yts, df, "svm_d1_unbalanced")
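# Optional sketch: row-normalised confusion matrix for the SVM (each row sums to 1,
# true genre vs. predicted genre; the `normalize` argument needs scikit-learn >= 0.22).
print(np.round(metrics.confusion_matrix(yts, yhat, normalize='true'), 2))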
### NeuralNetwork ###
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Conv1D, Flatten, MaxPooling1D, Embedding  # imported but unused below
import tensorflow.keras.backend as K
K.clear_session()
batch_size = 500
n_epochs = 100
filters = 100  # unused in this dense-only model
kernels = 3    # unused in this dense-only model
train = Ztr
test = Zts
num_classes = df['genre'].value_counts().index.size
model = Sequential()
# One wide ReLU hidden layer over the PCA features, then a softmax over the genres.
model.add(Dense(num_classes * 500, input_shape=train.shape[-1:], activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
from tensorflow.keras import optimizers
opt = optimizers.Adam(learning_rate=0.0001)  # `lr` was renamed `learning_rate` in tf.keras 2.x
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])  # compile() returns None, so it is not assigned
print(model.summary())
hist = model.fit(train, ytr, batch_size=batch_size,
                 epochs=n_epochs, validation_data=(test, yts),
                 shuffle=True)
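# Note: validation_data above is only used to monitor val_accuracy per epoch;
# a tf.keras.callbacks.EarlyStopping callback could stop training once it
# plateaus, but that is not part of this pipeline.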
tr_accuracy = hist.history['accuracy']
val_accuracy = hist.history['val_accuracy']
plt.plot(tr_accuracy)
plt.plot(val_accuracy)
plt.grid()
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend(['training accuracy', 'validation accuracy'])
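# Render the learning-curve figure when running as a plain script
# (plt.savefig(...) is an alternative for non-interactive runs).
plt.show()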
# predict_classes() was removed from tf.keras; take the argmax of the softmax output instead.
yhat = np.argmax(model.predict(Zts), axis=1)
print(np.unique(yhat))  # which classes the network actually predicts
analysis_p.misclassified_analysis(yhat, yts, df, 'nn_equal_classes_large')
print(df['genre'].cat.categories)  # label order used in the report below
print(metrics.classification_report(yts, yhat, labels=range(num_classes), target_names=df['genre'].cat.categories.values))