-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcode_v1.R
84 lines (72 loc) · 2.54 KB
/
code_v1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# NOTE: every file path below is resolved relative to this directory.
setwd("~/Desktop/PG")
# Attach packages up front. Unlike require(), library() stops with an error
# when a package is missing instead of returning FALSE and failing later.
library(stringr)
library(RTextTools)
library(useful)
library(plyr)
#################
## pre-process ##
#################
text <- read.delim("fitness_pre-processed_TEXT(2820).txt", header = FALSE)
text <- as.matrix(text)
label <- read.delim("fitness_Label_DATA_nlp.txt", header = FALSE)
label <- as.matrix(label)
# Every document needs exactly one label.
stopifnot(nrow(text) == nrow(label))
## text clean ##
# str_replace_all() is vectorised, so no loop over a hard-coded row count is
# needed; `text[] <-` keeps the matrix shape.
text[] <- str_replace_all(text, "[[:punct:]]", " ")
text <- tolower(text)
# combine data as a data frame
data <- data.frame(cbind(text, label))
names(data) <- c("text", "label")
## divide train and test sets ##
# The corpus size is taken from the data; the training size stays at 2250.
# NOTE: no RNG seed is set, so the split differs on every run.
n_docs <- nrow(data)
n_train <- 2250L
stopifnot(n_docs > n_train)
ind <- sample.int(n_docs, n_train, replace = FALSE)
train <- data[ind, ]
test <- data[-ind, ]
## data explore
# Named text_length so the base function length() is not shadowed; the plot
# title and axis label are set explicitly to stay as they were.
text_length <- str_length(text)
hist(text_length, main = "Histogram of length", xlab = "length")
# make dictionary
# One helper so the three document-term matrices share identical settings.
build_dtm <- function(docs) {
  create_matrix(
    docs,
    language = "english",
    removeNumbers = TRUE,
    removePunctuation = TRUE,
    removeStopwords = TRUE,
    stemWords = TRUE
  )
}
doc_matrix.train <- build_dtm(train[, 1])
doc_matrix.test <- build_dtm(test[, 1])
# doc_matrix.all is not referenced again in this script.
doc_matrix.all <- build_dtm(data[, 1])
# Keep only the terms that occur in both the training and the test matrix.
train_in_test <- which(colnames(doc_matrix.train) %in% colnames(doc_matrix.test))
test_in_train <- which(colnames(doc_matrix.test) %in% colnames(doc_matrix.train))
text.train <- as.matrix(doc_matrix.train[, train_in_test])
text.test <- as.matrix(doc_matrix.test[, test_in_train])
# Index the test columns by term name so column j means the same term in both
# matrices regardless of the order in which each matrix lists its terms.
text.test <- text.test[, colnames(text.train), drop = FALSE]
# Binary presence/absence indicators (1 if the term occurs in the document).
text.X.train <- 1 * (text.train > 0)
text.X.test <- 1 * (text.test > 0)
##########################
##naive bayes classifier##
##########################
# build parameters
# a[[1]] / a[[2]]: row indices of the training documents labelled 0 / 1.
a <- list(which(train$label == 0), which(train$label == 1))
# n.c: number of training documents per class; theta.c: class proportions.
n.c <- c(length(a[[1]]), length(a[[2]]))
theta.c <- n.c / nrow(train)
# n.jc[i, j]: number of class-i training documents that contain term j.
# The width comes from the feature matrix itself: the shared vocabulary
# changes with every random split, so a fixed column count cannot be assumed.
n.jc <- matrix(0, nrow = length(a), ncol = ncol(text.X.train))
for (i in seq_along(a)) {
  # drop = FALSE keeps a one-document class as a 1-row matrix for colSums().
  n.jc[i, ] <- colSums(text.X.train[a[[i]], , drop = FALSE])
}
#naive bayes algorithm
# Score documents with a two-class Bernoulli naive Bayes model.
#
# Arguments:
#   alpha    Additive smoothing constant added to every term count.
#   theta.c  Length-2 vector of class priors (first class, second class).
#   n.jc     2 x p matrix; n.jc[i, j] is the number of class-i training
#            documents that contain term j.
#   n.c      Length-2 vector with the number of training documents per class.
#   X        n x p 0/1 matrix of documents to score; its columns must
#            correspond to the columns of n.jc.
#
# Returns a numeric vector of length nrow(X) holding the log posterior odds of
# the second class versus the first. Values > 0 favour the second class.
naive.Bayes <- function(alpha, theta.c, n.jc, n.c, X) {
  X <- as.matrix(X)
  stopifnot(
    nrow(n.jc) == 2,
    length(n.c) == 2,
    length(theta.c) == 2,
    ncol(n.jc) == ncol(X)
  )
  # Smoothed Bernoulli estimate of P(term present | class). A term has two
  # outcomes (present / absent), so the denominator is n.c + 2 * alpha, which
  # keeps every estimate strictly inside (0, 1) whenever alpha > 0.
  theta.jc <- sweep(n.jc + alpha, 1, n.c + 2 * alpha, "/")
  log.present <- log(theta.jc)
  log.absent <- log1p(-theta.jc)
  # Per-term change in log odds when the term is present rather than absent.
  weight <- (log.present[2, ] - log.absent[2, ]) -
    (log.present[1, ] - log.absent[1, ])
  # Constant part: log odds of a document containing none of the terms, plus
  # the log prior odds.
  bias <- sum(log.absent[2, ] - log.absent[1, ]) + log(theta.c[2] / theta.c[1])
  drop(X %*% weight) + bias
}
# get prediction on testing set
# label.nb holds one score per test document as returned by naive.Bayes().
label.nb <- naive.Bayes(
  alpha = 1,
  theta.c = theta.c,
  n.jc = n.jc,
  n.c = n.c,
  X = text.X.test
)
# Predict 1 unless the score is <= 0. The vector length follows the number of
# scores rather than a fixed test-set size.
pred.nb <- rep(1, length(label.nb))
pred.nb[which(label.nb <= 0)] <- 0
test.label <- as.numeric(as.matrix(test[, 2]))
# Confusion table: true labels in rows, predictions in columns.
table(test.label, pred.nb)
# prob.nb is the raw score from naive.Bayes(), stored as a plain numeric vector.
prob.nb <- as.numeric(label.nb)
# compute accuracy
# Share of test documents whose prediction equals the true label.
acu.nb <- mean(pred.nb == test.label)
acu.nb
# write out results
finalresult <- cbind(test, pred.nb, prob.nb)
write.csv(finalresult, file = "testoutput.csv", row.names = FALSE)