|
| 1 | +# Script for classification in several steps |
| 2 | + |
1 | 3 | library(stylo)
|
| 4 | +library(tools) |
2 | 5 |
|
# Load all texts from the corpus "more_than_3_samples" and build the
# frequency list / table from them, so that the frequency list is based on
# all novels. (You could also load a pre-computed frequency table instead,
# if one exists.)

# Hoisted out of the call so the script is easy to point at another corpus.
corpus.dir <- "C:/Users/Administrator/Desktop/Stylo_GitHub/Stylometry_MiMoText_roman18/corpora/more_than_3_samples"

set.all <- load.corpus.and.parse(corpus.dir = corpus.dir, corpus.lang = "French")

# Keep the 3000 most frequent features; the train/test extraction below
# selects columns 1:3000 to match.
freq.list.all <- make.frequency.list(set.all, head = 3000)

frq.table.all <- make.table.of.frequencies(corpus = set.all, features = freq.list.all)
9 |
# Split into training and test sets based on the file names found inside the
# test/train folders; the (extension-stripped) file names match the row names
# of the frequency table.
test.list <- file_path_sans_ext(list.files(path = "test_strat"))
train.list <- file_path_sans_ext(list.files(path = "training_Strat"))
print(test.list)

# Extract the frequency rows for each set (first 3000 features).
test.set <- frq.table.all[test.list, 1:3000]
train.set <- frq.table.all[train.list, 1:3000]

print(dim(train.set))
|
print(train.list)


########### Preliminary test: SVM classification via perform.svm() ###########

results_svm <- perform.svm(train.set, test.set, no.of.candidates = 10)

# Wrap results in print() so they are also shown when the script is run
# non-interactively (bare top-level expressions only auto-print in an
# interactive session, not under source()/Rscript).
print(summary(results_svm))
print(performance.measures(results_svm))
print(results_svm$confusion_matrix)
print(results_svm$y)
print(results_svm$scores)
print(results_svm$ranking)
| 38 | + |
#### Cross-validation with different classifier methods; plot the results.
# Parallel to https://computationalstylistics.github.io/blog/performance_measures/

mfw_test <- seq(from = 100, to = 3000, by = 50)

# Classifier to evaluate: "delta", "svm", "knn", "nsc" or "naivebayes".
classifier <- "naivebayes"

# Preallocate the result vectors instead of growing them with c() inside the
# loop (growing copies the whole vector on every append).
f1_all <- numeric(length(mfw_test))
acc_all <- numeric(length(mfw_test))

for (i in seq_along(mfw_test)) {
  # Restrict the training table to the current number of most-frequent words.
  curr_set <- train.set[, 1:mfw_test[i]]
  curr_res <- crossv(training.set = curr_set,
                     cv.folds = 50,
                     classification.method = classifier)
  perf <- performance.measures(curr_res)

  f1_all[i] <- perf$avg.f
  acc_all[i] <- perf$accuracy
}
# Inspect the cross-validation result of the last MFW setting.
# FIX: the original referenced `results_cross_V`, which is never defined
# anywhere in this script and would raise "object not found"; the object
# produced by the loop above is `curr_res`.
print(summary(curr_res))
print(curr_res$confusion_matrix)
print(curr_res$misclassified)
print(performance.measures(curr_res))

# Plot F1 score (blue) and accuracy (red) against the number of MFWs.
plot(f1_all ~ mfw_test,
     main = "Performance measure: a comparison: Naive Bayes",  # typo "Performace" fixed
     ylab = "accuracy and F1 score",
     xlab = "most freq words",
     ylim = c(0.1, 1),
     col = "blue")

points(acc_all ~ mfw_test, col = "red")
legend("bottomright",
       legend = c("Accuracy", "F1 score"),
       col = c("red", "blue"),
       text.col = c("red", "blue"),
       pch = 1,
       bty = "n")
| 85 | + |
| 86 | + |
### As the delta classifier gave the best results:
# grid-search DELTA over all supported distance measures and a range of MFW
# values; save one CSV of results per distance measure.

supported.measures <- c("dist.euclidean", "dist.manhattan", "dist.canberra",
                        "dist.delta", "dist.eder", "dist.argamon",
                        "dist.simple", "dist.cosine", "dist.wurzburg",
                        "dist.entropy", "dist.minmax")

mfw.list <- seq(from = 500, to = 3000, by = 100)

for (s in supported.measures) {
  # Collect one row per MFW setting and bind them once at the end. This
  # avoids both the rbind-in-a-loop O(n^2) growth and the dummy all-zero
  # first row the original seeded Data_Frame with (it ended up in every CSV).
  rows <- vector("list", length(mfw.list))

  for (i in seq_along(mfw.list)) {
    y <- mfw.list[i]
    results <- classify(training.frequencies = train.set,
                        test.frequencies = test.set,
                        corpus.lang = "French",
                        mfw.min = y, mfw.max = y, mfw.incr = 100,
                        classification.method = "delta",
                        distance.measure = s,
                        cv.folds = 50, gui = FALSE,
                        culling.min = 0, culling.max = 0)
    print(results$overall.success.rate)

    # Compute the performance measures once per iteration instead of four times.
    perf <- performance.measures(results)
    rows[[i]] <- data.frame(Cull = 0,
                            MFW = y,
                            Acc = perf$accuracy,
                            Prec = perf$avg.precision,
                            Recall = perf$avg.recall,
                            FScore = perf$avg.f)
  }

  Data_Frame <- do.call(rbind, rows)
  write.csv(Data_Frame,
            sprintf("classification_delta_output/results_strat_split_w_NM_delta_%s_cull0.csv", s),
            row.names = TRUE)
}
|
40 | 118 |
|
41 |
| -write.csv(Data_Frame, "results_wurzburg_svm.csv", row.names = TRUE) |
|
0 commit comments