Skip to content

Commit 332f18b

Browse files
authored
Add files via upload
1 parent 256f4e6 commit 332f18b

File tree

1 file changed

+172
-0
lines changed

1 file changed

+172
-0
lines changed

Diff for: script_covid_preprocessing.R

+172
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
######################################
2+
#############PRE-PROCESSING###########
3+
######################################
4+
5+
library(dada2)
6+
library(phyloseq)
7+
8+
path <- "/path/to/reads"
9+
fns <- list.files(path)
10+
fns
11+
12+
fastqs <- fns[grepl(".fq$", fns)]
13+
fastqs <- sort(fastqs) # Sort ensures forward/reverse reads are in same order
14+
#### make sure that R1 is for forward read and R2 for reverse!!!!
15+
16+
fnFs <- fastqs[grepl(".1.fq", fastqs)] # Just the forward read files
17+
fnRs <- fastqs[grepl(".2.fq", fastqs)] # Just the reverse read files
18+
# Get sample names from the first part of the forward read filenames
19+
sample.names <- sapply(strsplit(fnFs, ".1.fq"), `[`, 1) ## check if is 1 or 2!
20+
21+
22+
# Fully specify the path for the fnFs and fnRs
23+
fnFs <- file.path(path, fnFs)
24+
fnRs <- file.path(path, fnRs)
25+
###############
26+
pdf("plotQualityProfile.pdf", onefile=T)
27+
plotQualityProfile(fnFs[1:10]) ## remove 20 plus 10
28+
plotQualityProfile(fnRs[1:10]) ## remove 20 plus 10
29+
dev.off()
30+
#### remove primers
31+
filt_path <- file.path(path, "filtered") # Place filtered files in filtered/ subdirectory
32+
filtFs <- file.path(filt_path, paste0(sample.names, "_F_filt.fastq.gz"))
33+
filtRs <- file.path(filt_path, paste0(sample.names, "_R_filt.fastq.gz"))
34+
35+
# Filter #### important remove primers!!! and remove low quality regions
36+
37+
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(130,200), ##150
38+
trimLeft=c(30, 30),
39+
maxN=0, maxEE=c(2,2), truncQ=11, rm.phix=TRUE,
40+
compress=TRUE, multithread=TRUE) #
41+
head(out)
42+
43+
###############
44+
pdf("plotQualityProfile.filt.pdf", onefile=T)
45+
plotQualityProfile(filtFs[1:15])
46+
plotQualityProfile(filtRs[1:15])
47+
dev.off()
48+
49+
set.seed(12345)
50+
51+
# Learn forward error rates
52+
errF <- learnErrors(filtFs, nread=1e6, multithread=TRUE)
53+
# Learn reverse error rates
54+
errR <- learnErrors(filtRs, nread=1e6, multithread=TRUE)
55+
# Sample inference and merger of paired-end reads
56+
mergers <- vector("list", length(sample.names))
57+
names(mergers) <- sample.names
58+
59+
pdf("plotErrors_F.pdf", onefile=T)
60+
plotErrors(errF, nominalQ=TRUE)
61+
dev.off()
62+
63+
pdf("plotErrors_R.pdf", onefile=T)
64+
plotErrors(errR, nominalQ=TRUE)
65+
dev.off()
66+
67+
68+
derepRs <- derepFastq(filtRs, verbose=TRUE)
69+
derepFs <- derepFastq(filtFs, verbose=TRUE)
70+
71+
# Name the derep-class objects by the sample names
72+
names(derepFs) <- sample.names
73+
names(derepRs) <- sample.names
74+
dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
75+
dadaRs <- dada(derepRs, err=errR, multithread=TRUE)
76+
77+
dadaFs[[1]]
78+
79+
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
80+
# Inspect the merger data.frame from the first sample
81+
head(mergers[[1]])
82+
83+
84+
# Construct sequence table and remove chimeras
85+
86+
seqtab <- makeSequenceTable(mergers)
87+
88+
dim(seqtab)
89+
90+
# Inspect distribution of sequence lengths
91+
table(nchar(getSequences(seqtab)))
92+
93+
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
94+
dim(seqtab.nochim)
95+
96+
sum(seqtab.nochim)/sum(seqtab)
97+
98+
getN <- function(x) sum(getUniques(x))
99+
track <- cbind(out, sapply(dadaFs, getN), sapply(mergers, getN), rowSums(seqtab), rowSums(seqtab.nochim))
100+
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
101+
colnames(track) <- c("input", "filtered", "denoised", "merged", "tabled", "nonchim")
102+
rownames(track) <- sample.names
103+
head(track)
104+
105+
106+
# Assign taxonomy
107+
taxHS <- assignTaxonomy(seqtab.nochim, "rdp_train_set_18.fa.gz", multithread=TRUE)
108+
taxHS <- addSpecies(taxHS, "rdp_species_assignment_18.fa.gz")
109+
colnames(taxHS) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus","Species")
110+
unname(head(taxHS))
111+
unname(tail(taxHS))
112+
113+
#ASV_names <- paste0("SVs",1:ncol(seqtab.nochim))
114+
#colnames(seqtab.nochim) <- ASV_names
115+
#rownames(taxHS) <- ASV_names
116+
117+
118+
# Write to disk
119+
write.table(track, file = "APR_track.tsv", quote=FALSE)
120+
write.table(seqtab.nochim, file = "APR_sequence_table_SV.tsv", quote=FALSE)
121+
write.table(taxHS, file = "APR_taxa_SV.tsv", quote=FALSE)
122+
123+
number_individuals <- as.data.frame(rowSums(seqtab.nochim))
124+
second_min_value <- sort(number_individuals[,1])[2]
125+
Samples_IDs <- rownames(seqtab.nochim)
126+
Sample_data_table <- as.data.frame(read.table("Metadata_table.txt", sep="\t", header=TRUE))
127+
rownames(Sample_data_table) <- Samples_IDs
128+
129+
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE),
130+
sample_data(Sample_data_table),
131+
tax_table(taxHS))
132+
133+
######################################
134+
############DECONTAMINATION###########
135+
######################################
136+
137+
#remove contamination
138+
sample_data(ps)$is.neg <- sample_data(ps)$Sample_or_Control == "Control Sample"
139+
contamdf.prev <- isContaminant(ps, method="prevalence", neg="is.neg")
140+
table(contamdf.prev$contaminant)
141+
head(which(contamdf.prev$contaminant))
142+
ps.pa <- transform_sample_counts(ps, function(abund) 1*(abund>0))
143+
ps.pa.neg <- prune_samples(sample_data(ps.pa)$Sample_or_Control == "Control Sample", ps.pa)
144+
ps.pa.pos <- prune_samples(sample_data(ps.pa)$Sample_or_Control == "True Sample", ps.pa)
145+
# Make data.frame of prevalence in positive and negative samples
146+
df.pa <- data.frame(pa.pos=taxa_sums(ps.pa.pos), pa.neg=taxa_sums(ps.pa.neg),
147+
contaminant=contamdf.prev$contaminant)
148+
ggplot(data=df.pa, aes(x=pa.neg, y=pa.pos, color=contaminant)) + geom_point() +
149+
xlab("Prevalence (Negative Controls)") + ylab("Prevalence (True Samples)")
150+
151+
#remove contamination from phyloseq object
152+
contamination <- as.integer(as.logical(contamdf.prev$contaminant))
153+
contaminants <- c("1")
154+
replacements <- c("3")
155+
contamination <- replace(contamination, contamination %in% contaminants, replacements)
156+
noncontaminants <- c("0")
157+
replacements <- c("1")
158+
contamination <- replace(contamination, contamination %in% noncontaminants, replacements)
159+
contaminants <- c("3")
160+
replacements <- c("0")
161+
contamination <- replace(contamination, contamination %in% contaminants, replacements)
162+
contamination <- as.logical(as.integer(contamination))
163+
ps.nocontam = prune_taxa(contamination, ps)
164+
165+
Metadata <- as.data.frame(read.table("Metadata_table.txt", sep="\t", header=TRUE))
166+
rownames(Metadata) <- Samples_IDs
167+
Nonblanks<-as.character(Metadata$Logic)
168+
Nonblanks <- as.logical(as.integer(Nonblanks))
169+
ps.nocontam.noblanks = prune_samples(Nonblanks, ps.nocontam)
170+
171+
saveRDS(ps.nocontam.noblanks, "data/COVID_contamremoved.rdata")
172+

0 commit comments

Comments
 (0)