|
| 1 | +# Script for classification in several steps |
| 2 | + |
1 | 3 | library(stylo)
|
| 4 | +library(tools) |
2 | 5 |
|
# Load all texts from the corpus "more_than_3_samples" and build the
# frequency list / table from them, so that the frequency list is based on
# all novels. (You could also load a pre-computed frequency table instead,
# if one exists.)

# Hoisted out of the call so the script is easy to point at another corpus.
corpus.dir <- "C:/Users/Administrator/Desktop/Stylo_GitHub/Stylometry_MiMoText_roman18/corpora/more_than_3_samples"

set.all <- load.corpus.and.parse(corpus.dir = corpus.dir, corpus.lang = "French")

# Keep the 3000 most frequent features; the train/test extraction below
# selects columns 1:3000 to match.
freq.list.all <- make.frequency.list(set.all, head = 3000)

frq.table.all <- make.table.of.frequencies(corpus = set.all, features = freq.list.all)
9 |
# Split into training and test sets based on the file names found inside the
# test/train folders; the (extension-stripped) file names match the row names
# of the frequency table.
test.list <- file_path_sans_ext(list.files(path = "test_strat"))
train.list <- file_path_sans_ext(list.files(path = "training_Strat"))
print(test.list)

# Extract the frequency rows for each set (first 3000 features).
test.set <- frq.table.all[test.list, 1:3000]
train.set <- frq.table.all[train.list, 1:3000]

print(dim(train.set))
|
print(train.list)


########### Preliminary test: SVM classification via perform.svm() ###########

results_svm <- perform.svm(train.set, test.set, no.of.candidates = 10)

# Wrap results in print() so they are also shown when the script is run
# non-interactively (bare top-level expressions only auto-print in an
# interactive session, not under source()/Rscript).
print(summary(results_svm))
print(performance.measures(results_svm))
print(results_svm$confusion_matrix)
print(results_svm$y)
print(results_svm$scores)
print(results_svm$ranking)
| 38 | + |
#### Cross-validation with different classifier methods; plot the results.
# Parallel to https://computationalstylistics.github.io/blog/performance_measures/

mfw_test <- seq(from = 100, to = 3000, by = 50)

# Classifier to evaluate: "delta", "svm", "knn", "nsc" or "naivebayes".
classifier <- "naivebayes"

# Preallocate the result vectors instead of growing them with c() inside the
# loop (growing copies the whole vector on every append).
f1_all <- numeric(length(mfw_test))
acc_all <- numeric(length(mfw_test))

for (i in seq_along(mfw_test)) {
  # Restrict the training table to the current number of most-frequent words.
  curr_set <- train.set[, 1:mfw_test[i]]
  curr_res <- crossv(training.set = curr_set,
                     cv.folds = 50,
                     classification.method = classifier)
  perf <- performance.measures(curr_res)

  f1_all[i] <- perf$avg.f
  acc_all[i] <- perf$accuracy
}
# Inspect the cross-validation result of the last MFW setting.
# FIX: the original referenced `results_cross_V`, which is never defined
# anywhere in this script and would raise "object not found"; the object
# produced by the loop above is `curr_res`.
print(summary(curr_res))
print(curr_res$confusion_matrix)
print(curr_res$misclassified)
print(performance.measures(curr_res))

# Plot F1 score (blue) and accuracy (red) against the number of MFWs.
plot(f1_all ~ mfw_test,
     main = "Performance measure: a comparison: Naive Bayes",  # typo "Performace" fixed
     ylab = "accuracy and F1 score",
     xlab = "most freq words",
     ylim = c(0.1, 1),
     col = "blue")

points(acc_all ~ mfw_test, col = "red")
legend("bottomright",
       legend = c("Accuracy", "F1 score"),
       col = c("red", "blue"),
       text.col = c("red", "blue"),
       pch = 1,
       bty = "n")
| 85 | + |
| 86 | + |
### As the delta classifier gave the best results:
# grid-search DELTA over all supported distance measures and a range of MFW
# values; save one CSV of results per distance measure.

supported.measures <- c("dist.euclidean", "dist.manhattan", "dist.canberra",
                        "dist.delta", "dist.eder", "dist.argamon",
                        "dist.simple", "dist.cosine", "dist.wurzburg",
                        "dist.entropy", "dist.minmax")

mfw.list <- seq(from = 500, to = 3000, by = 100)

for (s in supported.measures) {
  # Collect one row per MFW setting and bind them once at the end. This
  # avoids both the rbind-in-a-loop O(n^2) growth and the dummy all-zero
  # first row the original seeded Data_Frame with (it ended up in every CSV).
  rows <- vector("list", length(mfw.list))

  for (i in seq_along(mfw.list)) {
    y <- mfw.list[i]
    results <- classify(training.frequencies = train.set,
                        test.frequencies = test.set,
                        corpus.lang = "French",
                        mfw.min = y, mfw.max = y, mfw.incr = 100,
                        classification.method = "delta",
                        distance.measure = s,
                        cv.folds = 50, gui = FALSE,
                        culling.min = 0, culling.max = 0)
    print(results$overall.success.rate)

    # Compute the performance measures once per iteration instead of four times.
    perf <- performance.measures(results)
    rows[[i]] <- data.frame(Cull = 0,
                            MFW = y,
                            Acc = perf$accuracy,
                            Prec = perf$avg.precision,
                            Recall = perf$avg.recall,
                            FScore = perf$avg.f)
  }

  Data_Frame <- do.call(rbind, rows)
  write.csv(Data_Frame,
            sprintf("classification_delta_output/results_strat_split_w_NM_delta_%s_cull0.csv", s),
            row.names = TRUE)
}
|
40 | 118 |
|
41 |
| -write.csv(Data_Frame, "results_wurzburg_svm.csv", row.names = TRUE) |
|
0 commit comments