|
| 1 | +###################################### |
| 2 | +#############PRE-PROCESSING########### |
| 3 | +###################################### |
| 4 | + |
| 5 | +library(dada2) |
| 6 | +library(phyloseq) |
| 7 | + |
| 8 | +path <- "/path/to/reads" |
| 9 | +fns <- list.files(path) |
| 10 | +fns |
| 11 | + |
| 12 | +fastqs <- fns[grepl(".fq$", fns)] |
| 13 | +fastqs <- sort(fastqs) # Sort ensures forward/reverse reads are in same order |
| 14 | +#### make sure that R1 is for forward read and R2 for reverse!!!! |
| 15 | + |
| 16 | +fnFs <- fastqs[grepl(".1.fq", fastqs)] # Just the forward read files |
| 17 | +fnRs <- fastqs[grepl(".2.fq", fastqs)] # Just the reverse read files |
| 18 | +# Get sample names from the first part of the forward read filenames |
| 19 | +sample.names <- sapply(strsplit(fnFs, ".1.fq"), `[`, 1) ## check if is 1 or 2! |
| 20 | + |
| 21 | + |
| 22 | +# Fully specify the path for the fnFs and fnRs |
| 23 | +fnFs <- file.path(path, fnFs) |
| 24 | +fnRs <- file.path(path, fnRs) |
| 25 | +############### |
| 26 | +pdf("plotQualityProfile.pdf", onefile=T) |
| 27 | +plotQualityProfile(fnFs[1:10]) ## remove 20 plus 10 |
| 28 | +plotQualityProfile(fnRs[1:10]) ## remove 20 plus 10 |
| 29 | +dev.off() |
| 30 | +#### remove primers |
| 31 | +filt_path <- file.path(path, "filtered") # Place filtered files in filtered/ subdirectory |
| 32 | +filtFs <- file.path(filt_path, paste0(sample.names, "_F_filt.fastq.gz")) |
| 33 | +filtRs <- file.path(filt_path, paste0(sample.names, "_R_filt.fastq.gz")) |
| 34 | + |
| 35 | +# Filter #### important remove primers!!! and remove low quality regions |
| 36 | + |
| 37 | +out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(130,200), ##150 |
| 38 | + trimLeft=c(30, 30), |
| 39 | + maxN=0, maxEE=c(2,2), truncQ=11, rm.phix=TRUE, |
| 40 | + compress=TRUE, multithread=TRUE) # |
| 41 | +head(out) |
| 42 | + |
| 43 | +############### |
| 44 | +pdf("plotQualityProfile.filt.pdf", onefile=T) |
| 45 | +plotQualityProfile(filtFs[1:15]) |
| 46 | +plotQualityProfile(filtRs[1:15]) |
| 47 | +dev.off() |
| 48 | + |
| 49 | +set.seed(12345) |
| 50 | + |
| 51 | +# Learn forward error rates |
| 52 | +errF <- learnErrors(filtFs, nread=1e6, multithread=TRUE) |
| 53 | +# Learn reverse error rates |
| 54 | +errR <- learnErrors(filtRs, nread=1e6, multithread=TRUE) |
| 55 | +# Sample inference and merger of paired-end reads |
| 56 | +mergers <- vector("list", length(sample.names)) |
| 57 | +names(mergers) <- sample.names |
| 58 | + |
| 59 | +pdf("plotErrors_F.pdf", onefile=T) |
| 60 | +plotErrors(errF, nominalQ=TRUE) |
| 61 | +dev.off() |
| 62 | + |
| 63 | +pdf("plotErrors_R.pdf", onefile=T) |
| 64 | +plotErrors(errR, nominalQ=TRUE) |
| 65 | +dev.off() |
| 66 | + |
| 67 | + |
| 68 | +derepRs <- derepFastq(filtRs, verbose=TRUE) |
| 69 | +derepFs <- derepFastq(filtFs, verbose=TRUE) |
| 70 | + |
| 71 | +# Name the derep-class objects by the sample names |
| 72 | +names(derepFs) <- sample.names |
| 73 | +names(derepRs) <- sample.names |
| 74 | +dadaFs <- dada(derepFs, err=errF, multithread=TRUE) |
| 75 | +dadaRs <- dada(derepRs, err=errR, multithread=TRUE) |
| 76 | + |
| 77 | +dadaFs[[1]] |
| 78 | + |
| 79 | +mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE) |
| 80 | +# Inspect the merger data.frame from the first sample |
| 81 | +head(mergers[[1]]) |
| 82 | + |
| 83 | + |
| 84 | +# Construct sequence table and remove chimeras |
| 85 | + |
| 86 | +seqtab <- makeSequenceTable(mergers) |
| 87 | + |
| 88 | +dim(seqtab) |
| 89 | + |
| 90 | +# Inspect distribution of sequence lengths |
| 91 | +table(nchar(getSequences(seqtab))) |
| 92 | + |
| 93 | +seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE) |
| 94 | +dim(seqtab.nochim) |
| 95 | + |
| 96 | +sum(seqtab.nochim)/sum(seqtab) |
| 97 | + |
| 98 | +getN <- function(x) sum(getUniques(x)) |
| 99 | +track <- cbind(out, sapply(dadaFs, getN), sapply(mergers, getN), rowSums(seqtab), rowSums(seqtab.nochim)) |
| 100 | +# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs) |
| 101 | +colnames(track) <- c("input", "filtered", "denoised", "merged", "tabled", "nonchim") |
| 102 | +rownames(track) <- sample.names |
| 103 | +head(track) |
| 104 | + |
| 105 | + |
| 106 | +# Assign taxonomy |
| 107 | +taxHS <- assignTaxonomy(seqtab.nochim, "rdp_train_set_18.fa.gz", multithread=TRUE) |
| 108 | +taxHS <- addSpecies(taxHS, "rdp_species_assignment_18.fa.gz") |
| 109 | +colnames(taxHS) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus","Species") |
| 110 | +unname(head(taxHS)) |
| 111 | +unname(tail(taxHS)) |
| 112 | + |
| 113 | +#ASV_names <- paste0("SVs",1:ncol(seqtab.nochim)) |
| 114 | +#colnames(seqtab.nochim) <- ASV_names |
| 115 | +#rownames(taxHS) <- ASV_names |
| 116 | + |
| 117 | + |
| 118 | +# Write to disk |
| 119 | +write.table(track, file = "APR_track.tsv", quote=FALSE) |
| 120 | +write.table(seqtab.nochim, file = "APR_sequence_table_SV.tsv", quote=FALSE) |
| 121 | +write.table(taxHS, file = "APR_taxa_SV.tsv", quote=FALSE) |
| 122 | + |
| 123 | +number_individuals <- as.data.frame(rowSums(seqtab.nochim)) |
| 124 | +second_min_value <- sort(number_individuals[,1])[2] |
| 125 | +Samples_IDs <- rownames(seqtab.nochim) |
| 126 | +Sample_data_table <- as.data.frame(read.table("Metadata_table.txt", sep="\t", header=TRUE)) |
| 127 | +rownames(Sample_data_table) <- Samples_IDs |
| 128 | + |
| 129 | +ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), |
| 130 | + sample_data(Sample_data_table), |
| 131 | + tax_table(taxHS)) |
| 132 | + |
| 133 | +###################################### |
| 134 | +############DECONTAMINATION########### |
| 135 | +###################################### |
| 136 | + |
| 137 | +#remove contamination |
| 138 | +sample_data(ps)$is.neg <- sample_data(ps)$Sample_or_Control == "Control Sample" |
| 139 | +contamdf.prev <- isContaminant(ps, method="prevalence", neg="is.neg") |
| 140 | +table(contamdf.prev$contaminant) |
| 141 | +head(which(contamdf.prev$contaminant)) |
| 142 | +ps.pa <- transform_sample_counts(ps, function(abund) 1*(abund>0)) |
| 143 | +ps.pa.neg <- prune_samples(sample_data(ps.pa)$Sample_or_Control == "Control Sample", ps.pa) |
| 144 | +ps.pa.pos <- prune_samples(sample_data(ps.pa)$Sample_or_Control == "True Sample", ps.pa) |
| 145 | +# Make data.frame of prevalence in positive and negative samples |
| 146 | +df.pa <- data.frame(pa.pos=taxa_sums(ps.pa.pos), pa.neg=taxa_sums(ps.pa.neg), |
| 147 | + contaminant=contamdf.prev$contaminant) |
| 148 | +ggplot(data=df.pa, aes(x=pa.neg, y=pa.pos, color=contaminant)) + geom_point() + |
| 149 | + xlab("Prevalence (Negative Controls)") + ylab("Prevalence (True Samples)") |
| 150 | + |
| 151 | +#remove contamination from phyloseq object |
| 152 | +contamination <- as.integer(as.logical(contamdf.prev$contaminant)) |
| 153 | +contaminants <- c("1") |
| 154 | +replacements <- c("3") |
| 155 | +contamination <- replace(contamination, contamination %in% contaminants, replacements) |
| 156 | +noncontaminants <- c("0") |
| 157 | +replacements <- c("1") |
| 158 | +contamination <- replace(contamination, contamination %in% noncontaminants, replacements) |
| 159 | +contaminants <- c("3") |
| 160 | +replacements <- c("0") |
| 161 | +contamination <- replace(contamination, contamination %in% contaminants, replacements) |
| 162 | +contamination <- as.logical(as.integer(contamination)) |
| 163 | +ps.nocontam = prune_taxa(contamination, ps) |
| 164 | + |
| 165 | +Metadata <- as.data.frame(read.table("Metadata_table.txt", sep="\t", header=TRUE)) |
| 166 | +rownames(Metadata) <- Samples_IDs |
| 167 | +Nonblanks<-as.character(Metadata$Logic) |
| 168 | +Nonblanks <- as.logical(as.integer(Nonblanks)) |
| 169 | +ps.nocontam.noblanks = prune_samples(Nonblanks, ps.nocontam) |
| 170 | + |
| 171 | +saveRDS(ps.nocontam.noblanks, "data/COVID_contamremoved.rdata") |
| 172 | + |
0 commit comments