Campylobacter.Rmd

---
title: "Campylobacter"
author: "Nicole Wheeler"
date: "22 November 2016"
output: pdf_document
---

# Workflow for identifying genes under different sequence constraints in invasive Campylobacter compared to gastrointestinal close relatives. 

```{r}
# Read in the bitscore table, phenotype labels and orthologue table, then
# filter the genes (columns of traindata) down to those used for modelling.

library(caret)
library(randomForest)
set.seed(1)

# bitscores.tsv is read with its first column as row names and transposed,
# so the file's columns become the rows of traindata. The resulting columns
# are then put in alphabetical order.
traindata <- read.delim("bitscores.tsv", row.names = 1)
traindata <- t(traindata)
traindata <- traindata[, order(colnames(traindata))]

# Phenotype IDs go through make.names() before being matched against the
# row names of traindata (read.delim applies the same sanitising to column
# headers by default).
phenotype <- read.delim("phenotype.tsv", header = FALSE)
phenotype[, 1] <- make.names(phenotype[, 1])

traindata <- cbind.data.frame(
  traindata,
  phenotype = phenotype[match(row.names(traindata), phenotype[, 1]), 2]
)

# Roary seems to have split some paralogs in a way that doesn't make much
# sense - remove paralogous groups
orthologs <- read.delim("gene_presence_absence.txt", stringsAsFactors = FALSE)
has_nonunique_name <- orthologs$Non.unique.Gene.name != ""
nonunique <- orthologs$Non.unique.Gene.name[has_nonunique_name]
nonunique2 <- orthologs$Gene[has_nonunique_name]
traindata <- traindata[, !(colnames(traindata) %in% nonunique)]
traindata <- traindata[, !(colnames(traindata) %in% nonunique2)]
# Keep only groups averaging fewer than 1.1 sequences per isolate, plus the
# phenotype column.
single_copy <- orthologs$Gene[orthologs$Avg.sequences.per.isolate < 1.1]
traindata <- traindata[, colnames(traindata) %in% c(single_copy, "phenotype")]

# Missing bitscores are scored as 0.
traindata[is.na(traindata)] <- 0

# Drop near-zero-variance columns. The guard matters: with nothing flagged,
# nearZeroVar() returns integer(0) and `traindata[, -integer(0)]` would select
# zero columns, i.e. discard every gene.
nzv_cols <- nearZeroVar(traindata)
if (length(nzv_cols) > 0) {
  traindata <- traindata[, -nzv_cols]
}
names(traindata) <- make.names(names(traindata))

# also check trimming of inconsistent start sites caused by PROKKA/Prodigal
trimming <- read.table("inconsistent_starts.txt", header = FALSE)
hist(trimming$V2)
# 368 genes have had >50 amino acids trimmed from beginning due to
# inconsistent starts
# NOTE(review): the note above says >50, but the threshold applied below is
# >20 - confirm which cut-off is intended.
trimgenes <- trimming$V1[trimming$V2 > 20]
# fixed = TRUE strips the literal ".afa"; as a regex the "." would match any
# character (e.g. "xafa" inside a gene name).
trimgenes <- sub(".afa", "", trimgenes, fixed = TRUE)

# For each trimmed gene still present in traindata, count the strains with a
# bitscore above 0.
kept_trimgenes <- trimgenes[trimgenes %in% colnames(traindata)]
ogsize <- vapply(
  kept_trimgenes,
  function(gene) sum(traindata[, gene] > 0),
  integer(1),
  USE.NAMES = FALSE
)

# some of the genes had already been filtered out (possibly due to being too
# short to have a significant model match), down to 240, 128 of which have a
# representative in the full 30 strains. Removing these genes because the
# length trimmed off is too great to ignore
traindata <- traindata[, !(colnames(traindata) %in% trimgenes)]

```

Having read the data in and checked for consistent top model calls, we can identify 1874 orthologous groups with a single model hit, 249 with no model hit, and 40 with more than one model hit. Moving forward, we will only work with the 1874 clear cases. We're restricting the analysis to genes that have at least 10 gastrointestinal strains with representatives and at least 5 invasive strains with representatives. 

```{r}
# calculate summary statistics for each query OG
#
# For every gene column of traindata (all columns except the last, which is
# the phenotype) this computes:
#   dbsvals - median bitscore of Gastrointestinal strains minus median
#             bitscore of Invasive strains
#   ksvals  - two-sample Kolmogorov-Smirnov statistic between the two groups
#   data2   - genes x strains table of (gene median - strain bitscore)

gene_cols <- seq_len(ncol(traindata) - 1)
is_gastro <- traindata$phenotype == "Gastrointestinal"
is_invasive <- traindata$phenotype == "Invasive"

dbsvals <- rep(NA_real_, length(gene_cols))
ksvals <- rep(NA_real_, length(gene_cols))

for (i in gene_cols) {
  gastro_scores <- traindata[is_gastro, i]
  invasive_scores <- traindata[is_invasive, i]
  dbsvals[i] <- median(gastro_scores, na.rm = TRUE) -
    median(invasive_scores, na.rm = TRUE)
  # ks.test() discards missing values itself; it has no na.rm argument.
  ksvals[i] <- as.numeric(ks.test(gastro_scores, invasive_scores)$statistic)
}

# Build the deviation table in one pass instead of rbind()-ing one row per
# gene onto a growing data frame. Each vapply() result is one gene's
# deviations across strains, so the matrix is strains x genes until it is
# transposed.
deviations <- matrix(
  vapply(
    gene_cols,
    function(i) median(traindata[, i], na.rm = TRUE) - traindata[, i],
    numeric(nrow(traindata))
  ),
  nrow = nrow(traindata)
)
data2 <- as.data.frame(t(deviations))

sum(is.na(dbsvals))

row.names(data2) <- colnames(traindata)[-ncol(traindata)]
colnames(data2) <- row.names(traindata)

save(traindata, data2, dbsvals, ksvals, file = "data.Rdata")
```

```{r}
# summary stats for top genes
#
# Selects genes whose |DBS| and KS statistic are both above their respective
# 80th percentiles, attaches an empirical rank-based proportion to each
# statistic, and writes the table to summarystats_topgenes.tsv.

# dbsvals/ksvals have one entry per gene column, i.e. one fewer than
# colnames(traindata) (the last column is the phenotype). Restrict the names
# to the gene columns first; otherwise the shorter logical mask is recycled
# and "phenotype" can be picked up as a gene.
gene_names <- colnames(traindata)[seq_along(dbsvals)]
top_mask <- !is.na(dbsvals) & !is.na(ksvals) &
  abs(dbsvals) > quantile(abs(dbsvals), 0.8, na.rm = TRUE) &
  ksvals > quantile(ksvals, 0.8, na.rm = TRUE)

stats <- cbind.data.frame(
  gene = gene_names[top_mask],
  dbs = dbsvals[top_mask],
  ks = ksvals[top_mask]
)

# Proportion of all genes with a more extreme statistic than each top gene.
n_dbs <- sum(!is.na(dbsvals))
n_ks <- sum(!is.na(ksvals))
dbsp <- vapply(
  stats$dbs,
  function(d) sum(!is.na(dbsvals) & abs(dbsvals) > abs(d)) / n_dbs,
  numeric(1)
)
ksp <- vapply(
  stats$ks,
  function(k) sum(!is.na(ksvals) & ksvals > k) / n_ks,
  numeric(1)
)

Description <- orthologs$Annotation[match(stats[, 1], orthologs$Gene)]

stats2 <- cbind.data.frame(stats[, 1], Description, stats[, 2], dbsp, ks = stats[, 3], ksp)
names(stats2) <- c("Gene", "Description", "DBS", "DBS_P", "KS", "KS_P")
stats2$DBS_P <- round(stats2$DBS_P, digits = 3)
stats2$KS_P <- round(stats2$KS_P, digits = 3)

write.table(
  stats2[order(stats2$KS_P), ],
  file = "summarystats_topgenes.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

```


```{r}
# building a training dataset
#
# traintrim keeps only the genes whose KS statistic and |DBS| are both above
# their 80th percentiles, plus the phenotype column.

library(rpart)
library(caret)
library(randomForest)

keep_gene <- !is.na(dbsvals) & !is.na(ksvals) &
  ksvals > quantile(ksvals, 0.8, na.rm = TRUE) &
  abs(dbsvals) > quantile(abs(dbsvals), 0.8, na.rm = TRUE)

# keep_gene has one entry per gene column, one fewer than ncol(traindata).
# Index by position: a logical mask shorter than the number of columns is
# recycled, which could also select the phenotype column and then duplicate
# it in the cbind below.
traintrim <- traindata[, which(keep_gene), drop = FALSE]
traintrim <- cbind.data.frame(traintrim, phenotype = traindata$phenotype)

save(traindata, traintrim, file = "traindata.Rdata")
```


```{r}
# random forest approach
#
# Explores the effect of mtry on (a) a forest built on all genes and (b) a
# forest built on the feature-selected genes, fitting each setting n_reps
# times, then refits the all-gene forest for the importance plots.
# The fits run in the same order as before (all repeats of one mtry, then the
# next mtry), so the random number stream after set.seed(1) is consumed in the
# same sequence.

set.seed(1)
library(randomForest)

n_reps <- 3

# parameters for all genes
# NOTE: despite its name, big_rf_acc holds the misclassification rate
# (off-diagonal confusion counts / number of strains), not the accuracy.
big_mtry <- c(5, 10, 50, 100, 200, 500)
big_rf_acc <- numeric(length(big_mtry) * n_reps)
fit_id <- 0
for (i in big_mtry) {
  for (rep_id in seq_len(n_reps)) {
    big_rf <- randomForest(phenotype ~ ., data = traindata, ntree = 10000, mtry = i, na.action = na.roughfix, importance = TRUE, proximity = TRUE, keep.inbag = TRUE, sampsize = c(7, 7))
    fit_id <- fit_id + 1
    # Divide by the actual number of strains rather than a hard-coded 30.
    big_rf_acc[fit_id] <- (big_rf$confusion[1, 2] + big_rf$confusion[2, 1]) / nrow(traindata)
  }
}

png("wholemodel_mtry.png", width = 600, height = 500)
plot(x = rep(big_mtry, each = n_reps), y = big_rf_acc, xlab = "mtry", ylab = "Misclassification rate")
dev.off()

# parameters for the feature-selected genes; model_rf_acc holds the F1 score
# for the class in row/column 2 of the confusion matrix (presumably
# "Invasive", the second factor level alphabetically - confirm).
trim_mtry <- c(5, 10, 20, 30, 50, 100, 200)
model_rf_acc <- numeric(length(trim_mtry) * n_reps)
fit_id <- 0
for (i in trim_mtry) {
  for (rep_id in seq_len(n_reps)) {
    model_rf <- randomForest(phenotype ~ ., data = traintrim, ntree = 10000, mtry = i, na.action = na.roughfix, importance = TRUE, proximity = TRUE, keep.inbag = TRUE, sampsize = c(7, 7))
    # $confusion carries a third column (class.error); keep only the two
    # count columns so the recall denominator is a pure row total.
    conf <- model_rf$confusion[, 1:2]
    prec <- conf[2, 2] / sum(conf[, 2])
    rec <- conf[2, 2] / sum(conf[2, ])
    f1 <- 2 * prec * rec / (prec + rec)
    fit_id <- fit_id + 1
    model_rf_acc[fit_id] <- f1
  }
}

# mtrys lines up element-for-element with model_rf_acc; later chunks use
# mtrys[which.max(model_rf_acc)] to pick the best setting.
mtrys <- rep(trim_mtry, each = n_reps)
png("trimmodel_mtry.png", width = 600, height = 500)
plot(x = mtrys, y = model_rf_acc, xlab = "mtry", ylab = "F1 score")
dev.off()

big_rf <- randomForest(phenotype ~ ., data = traindata, ntree = 10000, mtry = 5, na.action = na.roughfix, importance = TRUE, proximity = TRUE, keep.inbag = TRUE, sampsize = c(7, 7))
png("imp_vs_ks.png", width = 500, height = 500)
plot(ksvals, big_rf$importance[, 3], xlab = "KS statistic", ylab = "Importance (mean decrease in accuracy)")
dev.off()

png("all_importance.png", width = 500, height = 400)
plot(y = big_rf$importance[order(big_rf$importance[, 3], decreasing = TRUE), 3], x = seq_len(nrow(big_rf$importance)), ylab = "Importance", xlab = "Gene ranking", pch = 16)
dev.off()

```


```{r, model built with feature selection}
library(randomForest)
library(dplyr)
set.seed(1)

# Fit the forest on the feature-selected genes, using the mtry value that
# scored highest in model_rf_acc.
best_mtry <- mtrys[which.max(model_rf_acc)]
model_rf_fs <- randomForest(
  phenotype ~ .,
  data = traintrim,
  ntree = 10000,
  mtry = best_mtry,
  importance = TRUE,
  proximity = TRUE,
  sampsize = c(7, 7)
)

# Plot the first column of err.rate against the number of trees grown.
# NOTE(review): unlike the other figures, this one is written to an absolute
# path under the home directory - confirm that is intended.
png(
  "~/Documents/invasive_campylobacter/my_workflow/error_by_num_trees.png",
  width = 700,
  height = 500
)
plot(y = model_rf_fs$err.rate[, 1], x = 1:length(model_rf_fs$err.rate[, 1]), pch = 16)
dev.off()
```


```{r}
# a control, where we feature select each time
#
# For each of n_perm label permutations: recompute the per-gene DBS and KS
# statistics against the permuted labels, select genes with the same
# 80th-percentile rule used for traintrim, fit a forest, and record its final
# error rate (permute) and per-gene importances (importance).
#
# Fixes relative to the previous version:
#  * genes are now selected with the permuted statistics (cdbsvals/cksvals);
#    the real-label dbsvals/ksvals were being used, so no feature selection
#    was actually repeated per permutation;
#  * the model formula is `phenotype ~ .`; with an external response the `.`
#    expanded to every column of ctraintrim, including the phenotype column
#    that is a copy of the response;
#  * the unused per-permutation table cdata2 is no longer rebuilt (it did not
#    depend on the permuted labels);
#  * the inner gene loop no longer reuses the outer loop variable.

n_perm <- 100
n_genes <- ncol(traindata) - 1

permute <- numeric(n_perm)
importance_by_perm <- vector("list", n_perm)

set.seed(1)
for (perm in seq_len(n_perm)) {
  # permute the class labels
  controlclass <- sample(traintrim$phenotype)
  # don't include permutations where the labels end up matching the real ones
  while (paste(controlclass, collapse = "") == paste(traintrim$phenotype, collapse = "")) {
    controlclass <- sample(traintrim$phenotype)
  }

  perm_gastro <- controlclass == "Gastrointestinal"
  perm_invasive <- controlclass == "Invasive"

  cdbsvals <- rep(NA_real_, n_genes)
  cksvals <- rep(NA_real_, n_genes)
  for (g in seq_len(n_genes)) {
    gastro_scores <- traindata[perm_gastro, g]
    invasive_scores <- traindata[perm_invasive, g]
    cdbsvals[g] <- median(gastro_scores, na.rm = TRUE) -
      median(invasive_scores, na.rm = TRUE)
    cksvals[g] <- as.numeric(ks.test(gastro_scores, invasive_scores)$statistic)
  }

  keep_gene <- !is.na(cdbsvals) & !is.na(cksvals) &
    cksvals > quantile(cksvals, 0.8, na.rm = TRUE) &
    abs(cdbsvals) > quantile(abs(cdbsvals), 0.8, na.rm = TRUE)
  # Index by position so the mask (one entry per gene) is never recycled onto
  # the phenotype column.
  ctraintrim <- traindata[, which(keep_gene), drop = FALSE]
  ctraintrim <- cbind.data.frame(ctraintrim, phenotype = controlclass)

  model_test <- randomForest(phenotype ~ ., data = ctraintrim, ntree = 10000, mtry = mtrys[which.max(model_rf_acc)], importance = TRUE, sampsize = c(7, 7))
  permute[perm] <- model_test$err.rate[nrow(model_test$err.rate), 1]
  importance_by_perm[[perm]] <- model_test$importance[, 3]
}
importance <- unlist(importance_by_perm)

# Proportion of permuted models whose error is no worse than the real model's.
observed_error <- model_rf_fs$err.rate[nrow(model_rf_fs$err.rate), 1]
sum(permute <= observed_error) / n_perm

png("rferror_trim.png", width = 450, height = 350)
hist(1 - permute, main = "", xlab = "Out-of-bag accuracy", breaks = 15, xlim = c(0, 1))
abline(v = 1 - observed_error, col = "red")
dev.off()

pdf("rferror_trim.pdf", width = 6, height = 5)
hist(1 - permute, main = "", xlab = "Out-of-bag accuracy", breaks = 15, xlim = c(0, 1))
abline(v = 1 - observed_error, col = "red")
dev.off()

```


```{r}
# looking at which variables are consistently valuable
#
# Compares the importances of the feature-selected model against the
# distribution of importances from the label-permuted models (`importance`),
# then refits the model 100 times to see which genes exceed the permutation
# 90th percentile consistently.

# Thresholds taken from the permuted-label importances.
vi_cut95 <- quantile(importance, 0.95)
vi_cut90 <- quantile(importance, 0.90)

observed_vi <- model_rf_fs$importance[, 3]
ranked_vi <- observed_vi[order(observed_vi, decreasing = TRUE)]

# Draws the ranked importances with the two permutation thresholds; called
# once per output device.
plot_feature_importance <- function() {
  plot(x = seq_along(ranked_vi), y = ranked_vi, xlab = "Gene ranking", ylab = "Gene importance (mean decrease in accuracy)", pch = 16)
  abline(h = vi_cut95, lty = 2, lwd = 2, col = "grey")
  abline(h = vi_cut90, lty = 3, lwd = 2, col = "grey")
}

png("Featureimportance.png", width = 400, height = 350)
plot_feature_importance()
dev.off()

pdf("Featureimportance.pdf", width = 6, height = 5, useDingbats = FALSE)
plot_feature_importance()
dev.off()

# Index the row names directly: subsetting the matrix first collapses to a
# plain vector (and loses its row names) when exactly one gene passes.
print(row.names(model_rf_fs$importance)[observed_vi > vi_cut90])

n_refits <- 100
vi_by_refit <- vector("list", n_refits)

set.seed(1)

for (i in seq_len(n_refits)) {
  model_rf_fs_perm <- randomForest(phenotype ~ ., data = traintrim, ntree = 10000, mtry = mtrys[which.max(model_rf_acc)], na.action = na.roughfix, importance = TRUE, proximity = TRUE, sampsize = c(7, 7))
  # model_rf_fs_perm <- randomForest(phenotype ~ ., data=traintrim, ntree=10000, mtry=100, na.action=na.roughfix, importance=T, proximity=T, sampsize=c(7,7))
  # trying the two extremes of mtry doesn't dramatically affect the results
  vi_by_refit[[i]] <- model_rf_fs_perm$importance[, 3]
}

# Flatten in refit order: one (gene, importance) pair per gene per refit.
allvars <- unlist(lapply(vi_by_refit, names))
allvis <- unlist(vi_by_refit)
topvars <- allvars[allvis > vi_cut90]
vis <- allvis[allvis > vi_cut90]

# Number of refits (out of n_refits) in which each gene passed the threshold.
table(allvars[allvis > vi_cut90])

top_counts <- table(allvars[allvis > vi_cut90])
topgenes <- names(top_counts[top_counts > 75])

compare_scores <- data.frame(Gene = allvars, VI = allvis, KS = ksvals[match(allvars, colnames(traindata))])
compare_scores <- compare_scores[order(compare_scores$VI, decreasing = TRUE), ]
# as.character() so the blanking below also works when Gene is a factor.
compare_scores$Unique_gene <- as.character(compare_scores$Gene)
compare_scores$Unique_gene[duplicated(compare_scores$Unique_gene)] <- ""

# Per-gene mean and SD of importance across the refits, in order of first
# appearance, and the fraction of permuted importances above each mean.
genes <- unique(allvars)
mean_vis <- vapply(genes, function(g) mean(allvis[allvars == g]), numeric(1), USE.NAMES = FALSE)
sd_vis <- vapply(genes, function(g) sd(allvis[allvars == g]), numeric(1), USE.NAMES = FALSE)
vi_p <- vapply(mean_vis, function(m) sum(m < importance) / length(importance), numeric(1))

# Prop_top is the count from top_counts; genes that never passed the
# threshold are absent from the table and come out as "NA".
compare_scores2 <- data.frame(Gene = genes, Description = orthologs$Annotation[match(genes, orthologs$Gene)], Mean_VI = mean_vis, VI_P = vi_p, SD_VIs = sd_vis, Prop_top = paste(top_counts[genes]), KS = ksvals[match(genes, colnames(traindata))], DBS = dbsvals[match(genes, colnames(traindata))])
write.csv(compare_scores2[order(compare_scores2$Mean_VI, decreasing = TRUE), ], file = "topgenes_rf.csv", row.names = FALSE)

library(ggrepel)
ggplot(compare_scores2, aes(x = KS, y = log(Mean_VI), label = Gene, col = SD_VIs)) + geom_point() + geom_text_repel(size = 3) + theme_bw() + xlim(0, 0.6)
ggsave("ks_vs_vi_labelled.pdf", width = 8, height = 5)

```


```{r}
# visualising results
#
# Scatter of the mreB bitscore against the group_735 bitscore (labelled Pgp1
# on the axis) for strains with group_735 > 590, annotated with the CC and
# morphology columns read from sample_morphologies.txt.

data <- data.frame(traindata)
data$sample <- row.names(data)

# Column 1 of the morphology file is matched to the sample names; columns 2
# and 3 supply CC and the numeric morphology code.
morphologies <- read.delim("sample_morphologies.txt", header = FALSE)
morph_row <- match(data$sample, morphologies[, 1])
data$CC <- morphologies[morph_row, 2]
data$morphology <- morphologies[morph_row, 3]

library(plyr)
morphology_labels <- c("1" = "Helical", "2" = "Elongated helical", "3" = "Rod")
data$morphology2 <- revalue(factor(data$morphology), morphology_labels)

ggplot(
  data[data$group_735 > 590, ],
  aes_string(
    x = "mreB",
    y = "group_735",
    col = "CC",
    shape = "phenotype",
    size = "morphology2",
    label = "sample"
  )
) +
  geom_jitter(width = 0.3, height = 0.3) +
  ylab("Bitscore - Pgp1") +
  xlab("Bitscore - MreB") +
  theme_classic(15)
ggsave("Figure4.png", width = 8, height = 6)
ggsave("Figure4.pdf", width = 8, height = 6, useDingbats = FALSE)

```

```{r}
# show score distribution of random pairs of genes - they usually cluster by ST

library(gridExtra)

# Scatter plot of the bitscores of two randomly chosen genes, coloured by
# phenotype. The two genes are drawn without replacement from the gene
# columns only, so a plot never uses "phenotype" as an axis or the same gene
# on both axes.
makeplot <- function() {
  gene_cols <- setdiff(colnames(traindata), "phenotype")
  stopifnot(length(gene_cols) >= 2)
  axes <- sample(gene_cols, 2)
  ggplot(traindata, aes_string(x = axes[1], y = axes[2], col = "phenotype")) +
    geom_jitter(width = 0.2, height = 0.2) +
    theme_bw(20) +
    scale_colour_manual(values = c("skyblue", "coral3"), name = "") +
    theme(legend.position = "top") +
    theme(axis.text.y = element_text(angle = 90, hjust = 0.5))
}

# Nine independent plots fill the 3 x 3 grid (previously the eighth plot was
# drawn twice to fill the last cell).
random_plots <- lapply(seq_len(9), function(k) makeplot())

pdf("Random_genes.pdf", width = 14, height = 14)
grid.arrange(grobs = random_plots, ncol = 3)
dev.off()
png("Random_genes.png", width = 1000, height = 1000)
grid.arrange(grobs = random_plots, ncol = 3)
dev.off()
```