Skip to content

Commit c135a73

Browse files
committed
added new files
1 parent d977b82 commit c135a73

File tree

128 files changed

+63361
-16
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

128 files changed

+63361
-16
lines changed

Diff for: classification_tests/classification.R

+93-16
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,118 @@
1+
# Script for classification in several steps
2+
13
library(stylo)
4+
library(tools)
25

3-
# load all texts, make freq list and table of frequencies
6+
# load all texts, make freq list and table of frequencies from the corpus "more than 3 samples"
7+
# so that the frequency list is built based on all novels
8+
# alternatively, you could load an existing frequency table instead
49
set.all <- load.corpus.and.parse(corpus.dir="C:/Users/Administrator/Desktop/Stylo_GitHub/Stylometry_MiMoText_roman18/corpora/more_than_3_samples", corpus.lang="French")
510
freq.list.all <- make.frequency.list(set.all, head=3000)
611
frq.table.all <- make.table.of.frequencies(corpus = set.all, features=freq.list.all)
712

813

9-
# split in training and test
10-
test.list <- file_path_sans_ext(list.files(path="test"))
11-
train.list <- file_path_sans_ext(list.files(path="training"))
14+
# split into training and test sets based on the filenames of the files in the training and test set folders
15+
16+
test.list <- file_path_sans_ext(list.files(path="test_strat"))
17+
train.list <- file_path_sans_ext(list.files(path="training_Strat"))
1218
print(test.list)
1319

20+
# extract frequencies for train set and test set
1421
test.set <- frq.table.all[c(test.list), 1:3000]
1522
train.set <- frq.table.all[c(train.list), 1:3000]
1623

1724

1825
print(dim(train.set))
19-
### search for best parameters
26+
print(train.list)
27+
28+
29+
########### using perform.svm() => preliminary test
30+
31+
results_svm <- perform.svm(train.set, test.set, no.of.candidates = 10)
32+
summary(results_svm)
33+
performance.measures(results_svm)
34+
results_svm$confusion_matrix
35+
results_svm$y
36+
results_svm$scores
37+
results_svm$ranking
38+
2039

40+
#### perform a cross validation with different classifier methods and plot the results
41+
# following the approach described at https://computationalstylistics.github.io/blog/performance_measures/
42+
43+
mfw_test = seq(from = 100, to=3000, by = 50)
44+
f1_all = c()
45+
acc_all= c()
46+
47+
classifier = "naivebayes" # , nsc, "svm", delta, knn, naivebayes
48+
49+
for (m in mfw_test){
50+
curr_set = train.set[, 1:m]
51+
curr_res = crossv(training.set = curr_set,
52+
cv.folds=50,
53+
classification.method=classifier)
54+
perf = performance.measures(curr_res)
55+
56+
get_f1 = perf$avg.f
57+
acc = perf$accuracy
58+
59+
f1_all = c(f1_all, get_f1)
60+
acc_all = c(acc_all, acc)
61+
62+
}
2163

22-
Data_Frame <- data.frame(Cull=c(0), MFW=c(0), Acc=c(0), Prec=c(0), Recall=c(0), FScore=c(0))
23-
Data_Frame
64+
summary(results_cross_V)
65+
results_cross_V$confusion_matrix
66+
results_cross_V$misclassified
2467

25-
cull.list <- seq(from = 20, to = 80, by = 10)
26-
mfw.list <- seq(from = 500, to=3000, by = 100)
27-
for (x in cull.list){
68+
performance.measures(results_cross_V)
69+
70+
# plot the results
71+
plot(f1_all ~ mfw_test,
72+
main = "Performace measure: a comparison: Naive Bayes",
73+
ylab = "accuracy and F1 score",
74+
xlab = "most freq words",
75+
ylim = c(0.1, 1),
76+
col = "blue")
77+
78+
points(acc_all ~ mfw_test, col="red")
79+
legend("bottomright",
80+
legend=c("Accuracy", "F1 score"),
81+
col = c("red", "blue"),
82+
text.col=c("red", "blue"),
83+
pch=1,
84+
bty="n")
85+
86+
87+
### as the delta classifier gave the best results:
88+
# search for best parameters in classifier DELTA using different distance measures and different MFWs
89+
# save the results in csv files
90+
91+
supported.measures = c("dist.euclidean", "dist.manhattan", "dist.canberra", "dist.delta",
92+
"dist.eder", "dist.argamon", "dist.simple", "dist.cosine", "dist.wurzburg",
93+
"dist.entropy", "dist.minmax")
94+
95+
96+
97+
mfw.list <- seq(from = 500, to=3000, by = 100)
98+
99+
for (s in supported.measures){
100+
Data_Frame <- data.frame(Cull=c(0), MFW=c(0), Acc=c(0), Prec=c(0), Recall=c(0), FScore=c(0))
28101
for (y in mfw.list){
29102

30103
results <- classify(training.frequencies = train.set, test.frequencies = test.set,
104+
corpus.lang = "French",
31105
mfw.min = y, mfw.max=y, mfw.incr = 100,
32-
distance.measure="wurzburg",
33-
classification.method = "svm", cv.folds = 20 , gui=FALSE, culling.min = x, culling.max = x)
106+
classification.method = "delta",
107+
distance.measure=s,
108+
cv.folds = 50 , gui=FALSE,
109+
culling.min = 0, culling.max = 0)
34110
print(results$overall.success.rate)
35-
print(results$success.rate)
36-
print(performance.measures(results)$accuracy)
37-
Data_Frame <- rbind(Data_Frame, c(x, y , performance.measures(results)$accuracy, performance.measures(results)$avg.precision,performance.measures(results)$avg.recall, performance.measures(results)$avg.f ))
111+
#print(results$success.rate)
112+
#print(performance.measures(results)$accuracy)
113+
Data_Frame <- rbind(Data_Frame, c(0, y , performance.measures(results)$accuracy, performance.measures(results)$avg.precision,performance.measures(results)$avg.recall, performance.measures(results)$avg.f ))
114+
38115
}
116+
write.csv(Data_Frame, sprintf("classification_delta_output/results_strat_split_w_NM_delta_%s_cull0.csv", s), row.names = TRUE)
39117
}
40118

41-
write.csv(Data_Frame, "results_wurzburg_svm.csv", row.names = TRUE)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"","Cull","MFW","Acc","Prec","Recall","FScore"
2+
"1",0,0,0,0,0,0
3+
"2",0,500,0.536111111111111,0.491642273550933,0.589090909090909,0.484196423212636
4+
"3",0,600,0.516666666666667,0.467553090165916,0.562272727272727,0.46068965819797
5+
"4",0,700,0.530555555555556,0.471640004000178,0.579545454545455,0.479533799267748
6+
"5",0,800,0.539444444444444,0.503798497072386,0.597954545454545,0.492554901434409
7+
"6",0,900,0.522777777777778,0.476892384073208,0.569772727272727,0.475887922930252
8+
"7",0,1000,0.503333333333333,0.508506011435757,0.548863636363636,0.46644428642293
9+
"8",0,1100,0.517777777777778,0.521768571066033,0.575454545454545,0.499095358911106
10+
"9",0,1200,0.507222222222222,0.531324784984113,0.57,0.492627931245457
11+
"10",0,1300,0.491111111111111,0.52706237199058,0.548863636363636,0.482110427596542
12+
"11",0,1400,0.501111111111111,0.494522211534884,0.557045454545455,0.478794423872439
13+
"12",0,1500,0.474444444444444,0.506067947379488,0.525909090909091,0.462296699782617
14+
"13",0,1600,0.443888888888889,0.482375225611685,0.502727272727273,0.429334054346709
15+
"14",0,1700,0.445,0.459734324771831,0.489772727272727,0.404939165494867
16+
"15",0,1800,0.463888888888889,0.464801123566407,0.525,0.438420858752579
17+
"16",0,1900,0.471111111111111,0.472661603723823,0.533409090909091,0.446507337013652
18+
"17",0,2000,0.471666666666667,0.468061222486297,0.534090909090909,0.441994336049943
19+
"18",0,2100,0.461111111111111,0.459188768872753,0.512727272727273,0.427380922818582
20+
"19",0,2200,0.461666666666667,0.452054483239642,0.509772727272727,0.425509726603306
21+
"20",0,2300,0.438333333333333,0.438708107895206,0.478636363636364,0.402039193066241
22+
"21",0,2400,0.451111111111111,0.443772560116855,0.502272727272727,0.415659147306223
23+
"22",0,2500,0.452222222222222,0.424502290360737,0.493863636363636,0.40417252609562
24+
"23",0,2600,0.427777777777778,0.417477598097476,0.470454545454545,0.386235446408656
25+
"24",0,2700,0.442222222222222,0.416393242155991,0.481136363636364,0.390085690767043
26+
"25",0,2800,0.417222222222222,0.395127228528768,0.455681818181818,0.374636649004488
27+
"26",0,2900,0.427777777777778,0.419262327718345,0.459772727272727,0.38204157906396
28+
"27",0,3000,0.395,0.417333252670674,0.445909090909091,0.365155017543171
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"","Cull","MFW","Acc","Prec","Recall","FScore"
2+
"1",0,0,0,0,0,0
3+
"2",0,500,0.517777777777778,0.524053490840345,0.583181818181818,0.491851023258427
4+
"3",0,600,0.491666666666667,0.459930585122767,0.551590909090909,0.447101270341223
5+
"4",0,700,0.516666666666667,0.558361480760282,0.584318181818182,0.488304722364716
6+
"5",0,800,0.512777777777778,0.541931294794597,0.5925,0.495723028934705
7+
"6",0,900,0.525,0.5383915855386,0.599090909090909,0.504697086516899
8+
"7",0,1000,0.504444444444444,0.526192266696329,0.565454545454545,0.47857772468584
9+
"8",0,1100,0.531666666666667,0.524505774190619,0.605227272727273,0.511017219493213
10+
"9",0,1200,0.514444444444444,0.509111459112569,0.570909090909091,0.479312736961499
11+
"10",0,1300,0.517222222222222,0.517217020526195,0.587272727272727,0.493134231836586
12+
"11",0,1400,0.536111111111111,0.521457735893076,0.615227272727273,0.51682391535159
13+
"12",0,1500,0.545,0.520129245094217,0.627954545454545,0.525579360670265
14+
"13",0,1600,0.542777777777778,0.522329925737345,0.622727272727273,0.520191695961654
15+
"14",0,1700,0.551666666666667,0.521606055645083,0.625909090909091,0.5236117637552
16+
"15",0,1800,0.555555555555556,0.540497886697694,0.634772727272727,0.536985067681814
17+
"16",0,1900,0.556666666666667,0.529397233911626,0.621136363636364,0.527317595159751
18+
"17",0,2000,0.560555555555556,0.547082108129042,0.625909090909091,0.537900771908436
19+
"18",0,2100,0.553888888888889,0.530422517815721,0.631136363636364,0.532959323450601
20+
"19",0,2200,0.577777777777778,0.5296753355471,0.654772727272727,0.54749379077562
21+
"20",0,2300,0.580555555555556,0.557172066579425,0.664772727272727,0.562278680511215
22+
"21",0,2400,0.569444444444444,0.542558490280837,0.636363636363636,0.543562955567782
23+
"22",0,2500,0.58,0.540617114514193,0.656363636363636,0.552823613586717
24+
"23",0,2600,0.573333333333333,0.533564301571636,0.642727272727273,0.543076467630639
25+
"24",0,2700,0.578333333333333,0.538503854259759,0.648181818181818,0.548496440996732
26+
"25",0,2800,0.598888888888889,0.531544423080157,0.666590909090909,0.555199376683716
27+
"26",0,2900,0.590555555555556,0.537970621380407,0.658636363636364,0.555588071917329
28+
"27",0,3000,0.588888888888889,0.543647052612872,0.654318181818182,0.551432185884345
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"","Cull","MFW","Acc","Prec","Recall","FScore"
2+
"1",0,0,0,0,0,0
3+
"2",0,500,0.434444444444444,0.334798531660954,0.472727272727273,0.376004852744626
4+
"3",0,600,0.452777777777778,0.366641811351638,0.508409090909091,0.40776767883881
5+
"4",0,700,0.445,0.359864429238444,0.494545454545455,0.398068616720506
6+
"5",0,800,0.433888888888889,0.352914946040426,0.477272727272727,0.382861167404163
7+
"6",0,900,0.435,0.352757096519085,0.477727272727273,0.385633792340062
8+
"7",0,1000,0.430555555555556,0.352089446461903,0.477272727272727,0.382011169600982
9+
"8",0,1100,0.428333333333333,0.339922313819914,0.473863636363636,0.37535204580021
10+
"9",0,1200,0.422777777777778,0.33832349539891,0.466818181818182,0.374141170527968
11+
"10",0,1300,0.431666666666667,0.349152596837277,0.480227272727273,0.385601275062287
12+
"11",0,1400,0.424444444444444,0.347888024322213,0.475454545454545,0.3806330646283
13+
"12",0,1500,0.426666666666667,0.351337931395381,0.477272727272727,0.384310439603572
14+
"13",0,1600,0.429444444444444,0.354473395241123,0.483636363636364,0.389704536985872
15+
"14",0,1700,0.413888888888889,0.34823371579765,0.470227272727273,0.377625904561944
16+
"15",0,1800,0.423888888888889,0.342611044313408,0.468181818181818,0.380814651971054
17+
"16",0,1900,0.426111111111111,0.352738038583318,0.483181818181818,0.388581959824732
18+
"17",0,2000,0.428333333333333,0.351469821256641,0.483636363636364,0.390163155751426
19+
"18",0,2100,0.423888888888889,0.347300043752378,0.488181818181818,0.385904722470778
20+
"19",0,2200,0.435,0.353909349785672,0.494090909090909,0.393542501889073
21+
"20",0,2300,0.413333333333333,0.336638107758224,0.468181818181818,0.372341785318722
22+
"21",0,2400,0.404444444444444,0.339802749363837,0.468409090909091,0.370991926013986
23+
"22",0,2500,0.421666666666667,0.352330957806605,0.477272727272727,0.384404127125928
24+
"23",0,2600,0.422222222222222,0.344382593127139,0.472727272727273,0.380554309131494
25+
"24",0,2700,0.43,0.360843939799592,0.494545454545455,0.396026510671865
26+
"25",0,2800,0.427777777777778,0.365010400279113,0.490681818181818,0.3970268866383
27+
"26",0,2900,0.425555555555556,0.359915298700452,0.483181818181818,0.393550725063225
28+
"27",0,3000,0.425555555555556,0.35179740487817,0.4825,0.387968807755733
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"","Cull","MFW","Acc","Prec","Recall","FScore"
2+
"1",0,0,0,0,0,0
3+
"2",0,500,0.579444444444444,0.499553039239186,0.628181818181818,0.524174962545075
4+
"3",0,600,0.587777777777778,0.526796359660673,0.640681818181818,0.544262692637905
5+
"4",0,700,0.588333333333333,0.516015977385766,0.6375,0.538457629651393
6+
"5",0,800,0.603888888888889,0.517480552947162,0.667272727272727,0.557283644689707
7+
"6",0,900,0.617222222222222,0.52306718166709,0.675,0.563283214537113
8+
"7",0,1000,0.610555555555556,0.527298576890319,0.667954545454545,0.560336755692528
9+
"8",0,1100,0.610555555555556,0.537938090855464,0.665909090909091,0.565066560189276
10+
"9",0,1200,0.606666666666667,0.553089070678173,0.664318181818182,0.565649007662206
11+
"10",0,1300,0.611666666666667,0.548777201698188,0.671136363636364,0.571817751931327
12+
"11",0,1400,0.599444444444444,0.555833824968888,0.6575,0.563998108617439
13+
"12",0,1500,0.616111111111111,0.565979362844175,0.672045454545455,0.579538770725535
14+
"13",0,1600,0.605555555555556,0.544841243146235,0.657272727272727,0.560023907754932
15+
"14",0,1700,0.612777777777778,0.540734860366349,0.667272727272727,0.568870274934539
16+
"15",0,1800,0.608888888888889,0.547408874986209,0.662727272727273,0.567709862038573
17+
"16",0,1900,0.615,0.543619563890045,0.6725,0.571854521314704
18+
"17",0,2000,0.623333333333333,0.565527400939592,0.691136363636364,0.589890708882302
19+
"18",0,2100,0.624444444444444,0.563591721442725,0.697727272727273,0.590501577250892
20+
"19",0,2200,0.614444444444444,0.563627549260094,0.680681818181818,0.583567573456885
21+
"20",0,2300,0.617777777777778,0.566283537620205,0.685,0.587975437795466
22+
"21",0,2400,0.602777777777778,0.531915694627446,0.658409090909091,0.559679744195535
23+
"22",0,2500,0.608888888888889,0.532216542064808,0.666590909090909,0.565007951290292
24+
"23",0,2600,0.613333333333333,0.539145580724661,0.677954545454545,0.571927936470008
25+
"24",0,2700,0.616111111111111,0.541083592141667,0.676363636363636,0.573477944579542
26+
"25",0,2800,0.618888888888889,0.543245245244746,0.682727272727273,0.579076343888977
27+
"26",0,2900,0.612222222222222,0.535628539358398,0.674090909090909,0.569519047057036
28+
"27",0,3000,0.616666666666667,0.535140552371747,0.677045454545455,0.569857853218648
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"","Cull","MFW","Acc","Prec","Recall","FScore"
2+
"1",0,0,0,0,0,0
3+
"2",0,500,0.579444444444444,0.509519021768661,0.630227272727273,0.532389358270023
4+
"3",0,600,0.577777777777778,0.504954509977135,0.626363636363636,0.525975751565741
5+
"4",0,700,0.575,0.485993377343196,0.621136363636364,0.517188136967143
6+
"5",0,800,0.58,0.489719348982956,0.628409090909091,0.522964355081834
7+
"6",0,900,0.578888888888889,0.49144303393166,0.627727272727273,0.521618224272552
8+
"7",0,1000,0.586666666666667,0.494109279281922,0.6375,0.52945323395123
9+
"8",0,1100,0.594444444444444,0.507061846431522,0.649090909090909,0.541313818682086
10+
"9",0,1200,0.592777777777778,0.506515528498998,0.649772727272727,0.542754183124284
11+
"10",0,1300,0.607777777777778,0.512213115965699,0.667045454545454,0.556357942742845
12+
"11",0,1400,0.622222222222222,0.521495171580401,0.685,0.570504096998696
13+
"12",0,1500,0.611666666666667,0.523714458045979,0.672045454545455,0.562801404886591
14+
"13",0,1600,0.617222222222222,0.530519446604271,0.676363636363636,0.570829022201779
15+
"14",0,1700,0.630555555555556,0.535338688548613,0.692045454545455,0.579729936385024
16+
"15",0,1800,0.629444444444444,0.542541323142045,0.698409090909091,0.583986255059468
17+
"16",0,1900,0.625,0.542397609891922,0.69,0.580757812388159
18+
"17",0,2000,0.627777777777778,0.552830891023163,0.696590909090909,0.587515970294976
19+
"18",0,2100,0.632777777777778,0.561790538799719,0.700909090909091,0.591470725047821
20+
"19",0,2200,0.627222222222222,0.566065107959613,0.696136363636364,0.592838584094313
21+
"20",0,2300,0.623888888888889,0.574170699194382,0.693863636363636,0.591297964201674
22+
"21",0,2400,0.622222222222222,0.565694277173436,0.689318181818182,0.59041792482338
23+
"22",0,2500,0.622777777777778,0.565388998974683,0.694318181818182,0.589638650256154
24+
"23",0,2600,0.618333333333333,0.566889869261789,0.685227272727273,0.588397030429622
25+
"24",0,2700,0.623333333333333,0.564712465520854,0.685909090909091,0.589013204622039
26+
"25",0,2800,0.630555555555556,0.571106139162416,0.702272727272727,0.597352816608173
27+
"26",0,2900,0.630555555555556,0.572646511119486,0.701818181818182,0.598187064596545
28+
"27",0,3000,0.623333333333333,0.559978237084413,0.689772727272727,0.587061879089773
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"","Cull","MFW","Acc","Prec","Recall","FScore"
2+
"1",0,0,0,0,0,0
3+
"2",0,500,0.553333333333333,0.468965802653616,0.615681818181818,0.508663889491787
4+
"3",0,600,0.541666666666667,0.462043194412632,0.596818181818182,0.496975668037242
5+
"4",0,700,0.561111111111111,0.472208909423147,0.62,0.513569860216849
6+
"5",0,800,0.567777777777778,0.490184663651787,0.634772727272727,0.532728945257861
7+
"6",0,900,0.556111111111111,0.474820985515492,0.616363636363636,0.515563387134152
8+
"7",0,1000,0.559444444444444,0.486755341849056,0.615909090909091,0.519459781917947
9+
"8",0,1100,0.555,0.493134078283794,0.614090909090909,0.518740196059201
10+
"9",0,1200,0.553888888888889,0.494956698772908,0.617272727272727,0.518520663881043
11+
"10",0,1300,0.563888888888889,0.495456599279077,0.622045454545455,0.525966599814055
12+
"11",0,1400,0.56,0.510321954286447,0.624090909090909,0.535119820619195
13+
"12",0,1500,0.572777777777778,0.526673937923773,0.6425,0.550061333997066
14+
"13",0,1600,0.561111111111111,0.505739936055324,0.627954545454545,0.531618617915197
15+
"14",0,1700,0.552777777777778,0.495800926179512,0.612954545454545,0.523086262816631
16+
"15",0,1800,0.571111111111111,0.520244428558704,0.637045454545455,0.542519207456255
17+
"16",0,1900,0.563888888888889,0.507021185479421,0.621590909090909,0.532803254745669
18+
"17",0,2000,0.569444444444444,0.520343010867672,0.636818181818182,0.547494852871178
19+
"18",0,2100,0.565,0.517643290828336,0.629545454545455,0.538491560986838
20+
"19",0,2200,0.562777777777778,0.525519807300425,0.633181818181818,0.541111231128672
21+
"20",0,2300,0.560555555555556,0.515132047733309,0.622045454545455,0.53269237319788
22+
"21",0,2400,0.556111111111111,0.518594566554901,0.614545454545455,0.529066903975926
23+
"22",0,2500,0.558333333333333,0.509330434214452,0.618181818181818,0.529919999918858
24+
"23",0,2600,0.557222222222222,0.50839251076083,0.6125,0.522984156745081
25+
"24",0,2700,0.561666666666667,0.515802436153741,0.620227272727273,0.5317634967877
26+
"25",0,2800,0.551666666666667,0.505019801420831,0.614545454545455,0.524709382255298
27+
"26",0,2900,0.575555555555556,0.526824012136999,0.637954545454545,0.546780048780177
28+
"27",0,3000,0.566666666666667,0.52370769384044,0.6275,0.535010306103089

0 commit comments

Comments
 (0)