Skip to content

Commit 738e222

Browse files
authored
Add files via upload
1 parent 0e2a4f7 commit 738e222

39 files changed

+348061
-3
lines changed

DAS_Tool.sh

+336
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
#!/bin/bash
2+
3+
# DAS Tool for genome-resolved metagenomics
4+
# by Christian MK Sieber ([email protected])
5+
#
6+
#
7+
# DAS Tool Copyright (c) 2017, The Regents of the University of California, through Lawrence
8+
# Berkeley National Laboratory (subject to receipt of any required approvals from the U.S.
9+
# Dept. of Energy). All rights reserved.
10+
#
11+
# If you have questions about your rights to use or distribute this software, please contact
12+
# Berkeley Lab's Innovation and Partnerships department at [email protected] referring to
13+
# "DAS Tool (2017-024)".
14+
#
15+
# NOTICE. This software was developed under funding from the U.S. Department of Energy. As
16+
# such, the U.S. Government has been granted for itself and others acting on its behalf a
17+
# paid-up, nonexclusive, irrevocable, worldwide license in the Software to reproduce,
18+
# prepare derivative works, and perform publicly and display publicly. The U.S. Government
19+
# is granted for itself and others acting on its behalf a paid-up, nonexclusive,
20+
# irrevocable, worldwide license in the Software to reproduce, prepare derivative works,
21+
# distribute copies to the public, perform publicly and display publicly, and to permit
22+
# others to do so.
23+
#
24+
25+
SOURCE="${BASH_SOURCE[0]}"
26+
while [ -h "$SOURCE" ]; do
27+
DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
28+
SOURCE="$(readlink "$SOURCE")"
29+
[[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE"
30+
done
31+
DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
32+
33+
thisdir=$(pwd)
34+
35+
function display_version() {
36+
echo
37+
echo "DAS Tool version 1.0"
38+
echo
39+
exit 1
40+
}
41+
42+
43+
function display_help() {
44+
echo " "
45+
echo "DAS Tool version 1.0"
46+
echo " "
47+
echo "Usage: DAS_Tool.sh -i methodA.scaffolds2bin,...,methodN.scaffolds2bin"
48+
echo " -l methodA,...,methodN -c contigs.fa -o myOutput"
49+
echo
50+
echo " -i, --bins Comma separated list of tab separated scaffolds to bin tables."
51+
echo " -c, --contigs Contigs in fasta format."
52+
echo " -o, --outputbasename Basename of output files."
53+
echo " -l, --labels Comma separated list of binning prediction names. (optional)"
54+
echo " --search_engine Engine used for single copy gene identification [blast/diamond/usearch]."
55+
echo " (default: usearch)"
56+
echo " --write_bin_evals Write evaluation for each input bin set [0/1]. (default: 1)"
57+
echo " --create_plots Create binning performance plots [0/1]. (default: 1)"
58+
echo " --write_bins Export bins as fasta files [0/1]. (default: 0)"
59+
echo " --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo)."
60+
echo " Gene prediction step will be skipped if given. (optional)"
61+
echo " --score_threshold Score threshold until selection algorithm will keep selecting bins [0..1]."
62+
echo " (default: 0.1)"
63+
echo " -t, --threads Number of threads to use. (default: 1)"
64+
echo " -v, --version Print version number and exit."
65+
echo " -h, --help Show this message."
66+
echo
67+
echo "Example 1: Run DAS Tool on binning predictions of MetaBAT, MaxBin, CONCOCT and tetraESOMs. Output files will start with the prefix DASToolRun1: "
68+
echo " ./DAS_Tool.sh -i sample_data/sample.human.gut_concoct_scaffolds2bin.tsv,sample_data/sample.human.gut_maxbin2_scaffolds2bin.tsv,sample_data/sample.human.gut_metabat_scaffolds2bin.tsv,sample_data/sample.human.gut_tetraESOM_scaffolds2bin.tsv -l concoct,maxbin,metabat,tetraESOM -c sample_data/sample.human.gut_contigs.fa -o sample_output/DASToolRun1"
69+
echo
70+
echo "Example 2: Run DAS Tool again with different parameters. Use the proteins predicted in Example 1 to skip the gene prediction step. Set the number of threads to 2 and score threshold to 0.6. Output files will start with the prefix DASToolRun2: "
71+
echo " ./DAS_Tool.sh -i sample_data/sample.human.gut_concoct_scaffolds2bin.tsv,sample_data/sample.human.gut_maxbin2_scaffolds2bin.tsv,sample_data/sample.human.gut_metabat_scaffolds2bin.tsv,sample_data/sample.human.gut_tetraESOM_scaffolds2bin.tsv -l concoct,maxbin,metabat,tetraESOM -c sample_data/sample.human.gut_contigs.fa -o sample_output/DASToolRun2 --threads 2 --score_threshold 0.6 --proteins sample_output/DASToolRun1_proteins.faa"
72+
echo " "
73+
echo " "
74+
echo "Please cite: https://doi.org/10.1101/107789"
75+
echo " "
76+
echo " "
77+
exit 1
78+
}
79+
80+
[ $# -eq 0 ] && { display_help ; exit 1; }
81+
82+
binlabels="NULL"
83+
mappedreads="NULL"
84+
coverage_table="NULL"
85+
debug="FALSE"
86+
use_gc=1
87+
use_N50=1
88+
executed_prodigal=0
89+
score_threshold=0.1
90+
threads=1
91+
write_bin_evals=1
92+
create_plots=1
93+
write_bins=0
94+
search_engine="usearch"
95+
96+
while [ "$1" != "" ]; do
97+
case $1 in
98+
-i | --bins ) shift
99+
scaffoldstobins=$1
100+
;;
101+
-l | --labels ) shift
102+
binlabels=$1
103+
;;
104+
-c | --contigs ) shift
105+
contigs=$1
106+
;;
107+
-o | --outputbasename ) shift
108+
outputbasename=$1
109+
;;
110+
--write_bin_evals ) shift
111+
write_bin_evals=$1
112+
;;
113+
--create_plots ) shift
114+
create_plots=$1
115+
;;
116+
--write_bins ) shift
117+
write_bins=$1
118+
;;
119+
--proteins ) shift
120+
proteins=$1
121+
;;
122+
--score_threshold ) shift
123+
score_threshold=$1
124+
;;
125+
--search_engine ) shift
126+
search_engine=$1
127+
;;
128+
-t | --threads ) shift
129+
threads=$1
130+
;;
131+
-v | --version ) shift
132+
display_version
133+
exit
134+
;;
135+
--debug ) shift
136+
debug="TRUE"
137+
;;
138+
-h | --help ) display_help
139+
exit
140+
;;
141+
* ) display_help
142+
exit 1
143+
esac
144+
shift
145+
done
146+
147+
148+
if [ "$write_bins" == "0" ] || [ "$write_bins" == "false" ] || [ "$write_bins" == "False" ] || [ "$write_bins" == "f" ]; then
149+
write_bins="FALSE"
150+
else
151+
write_bins="TRUE"
152+
fi
153+
if [ "$write_bin_evals" == "0" ] || [ "$write_bin_evals" == "false" ] || [ "$write_bin_evals" == "False" ] || [ "$write_bin_evals" == "f" ]; then
154+
write_bin_evals="FALSE"
155+
else
156+
write_bin_evals="TRUE"
157+
fi
158+
if [ "$create_plots" == "0" ] || [ "$create_plots" == "false" ] || [ "$create_plots" == "False" ] || [ "$create_plots" == "f" ]; then
159+
create_plots="FALSE"
160+
else
161+
create_plots="TRUE"
162+
fi
163+
164+
165+
#
166+
# 0.1 Load dependencies
167+
#
168+
PATH="$PATH:$DASTOOL_PULLSEQ"
169+
PATH="$PATH:$DASTOOL_PRODIGAL"
170+
PATH="$PATH:$DASTOOL_DIAMOND"
171+
PATH="$PATH:$DASTOOL_BLAST"
172+
PATH="$PATH:$DASTOOL_USEARCH"
173+
174+
175+
#check search engine
176+
if [ "$search_engine" == "diamond" ] || [ "$search_engine" == "DIAMOND" ] || [ "$search_engine" == "Diamond" ] || [ "$search_engine" == "d" ] || [ "$search_engine" == "D" ]; then
177+
command -v diamond >/dev/null 2>&1 || { echo >&2 "Can't find diamond. Aborting."; exit 1; }
178+
search_engine="diamond"
179+
fi
180+
if [ "$search_engine" == "blast" ] || [ "$search_engine" == "BLAST" ] || [ "$search_engine" == "Blast" ] || [ "$search_engine" == "b" ] || [ "$search_engine" == "B" ] || [ "$search_engine" == "blastp" ]; then
181+
command -v makeblastdb >/dev/null 2>&1 || { echo >&2 "Can't find makeblastdb. Aborting."; exit 1; }
182+
command -v blastp >/dev/null 2>&1 || { echo >&2 "Can't find blastp. Aborting."; exit 1; }
183+
search_engine="blast"
184+
fi
185+
if [ "$search_engine" == "usearch" ] || [ "$search_engine" == "USEARCH" ] || [ "$search_engine" == "UBLAST" ] || [ "$search_engine" == "Usearch" ] || [ "$search_engine" == "u" ] || [ "$search_engine" == "U" ] || [ "$search_engine" == "ublast" ] || [ "$search_engine" == "Ublast" ]; then
186+
command -v usearch >/dev/null 2>&1 || { echo >&2 "Can't find usearch. Aborting."; exit 1; }
187+
search_engine="usearch"
188+
fi
189+
190+
#
191+
# 0.2 Check existence of files
192+
#
193+
if [ ! -f $contigs ]
194+
then
195+
echo contig file not found: $contigs
196+
echo stopping
197+
exit 1
198+
fi
199+
200+
if [ ! -z "$proteins" ] && [ ! -f $proteins ]
201+
then
202+
echo proteins file not found: $proteins
203+
echo stopping
204+
exit 1
205+
fi
206+
207+
if [ "$coverage_table" != "NULL" ] && [ ! -f $coverage_table ]
208+
then
209+
echo coverage table not found: $coverage_table
210+
echo stopping
211+
exit 1
212+
fi
213+
214+
if [ "$mappedreads" != "NULL" ] && [ ! -f $mappedreads ]
215+
then
216+
echo bam file not found: $mappedreads
217+
echo stopping
218+
exit 1
219+
fi
220+
221+
for i in $(echo $scaffoldstobins | tr "," " ")
222+
do
223+
if [ ! -f $i ];
224+
then
225+
echo "scaffolds2bin file not found: $i "
226+
exit 1
227+
fi
228+
done
229+
230+
231+
232+
#
233+
# 1. Run Prodigal
234+
#
235+
if [ -z "$proteins" ]
236+
then
237+
238+
#check if dependencies for gene prediction are installed
239+
command -v pullseq >/dev/null 2>&1 || { echo >&2 "Can't find pullseq. Aborting."; exit 1; }
240+
command -v prodigal >/dev/null 2>&1 || { echo >&2 "Can't find prodigal. Aborting."; exit 1; }
241+
242+
proteins=$outputbasename\_proteins.faa
243+
244+
echo "predicting genes using prodigal"
245+
grep ">" $contigs | perl -pe 's/>//g;' | shuf > dastoolthreadstmpheader
246+
dastoolthreadstmpheader=dastoolthreadstmpheader
247+
total_lines=$(wc -l <${dastoolthreadstmpheader})
248+
((lines_per_file = (total_lines + threads - 1) / threads))
249+
split --numeric-suffixes --lines=${lines_per_file} ${dastoolthreadstmpheader} dastoolthreadstmp.
250+
rm dastoolthreadstmpheader
251+
252+
for i in dastoolthreadstmp.*
253+
do
254+
pullseq --input $contigs --names $i > $i\_contigs.fa
255+
prodigal -a $i\_partialprot.faa -i $i\_contigs.fa -p meta -m -q > /dev/null 2>&1 &
256+
done
257+
wait
258+
cat *_partialprot.faa > $proteins
259+
rm dastoolthreadstmp.*
260+
executed_prodigal=1
261+
elif [ ! -f $proteins ]
262+
then
263+
echo protein file not found: $proteins
264+
echo stopping
265+
exit 1
266+
fi
267+
268+
269+
270+
#
271+
# 2. Predict Single Copy Genes
272+
#
273+
bscg=$proteins\.bacteria.scg
274+
ascg=$proteins\.archaea.scg
275+
if [ ! -f $bscg ] || [ ! -f $ascg ] || [ $executed_prodigal == 1 ]; then
276+
echo "identifying single copy genes using $search_engine"
277+
#predict bacterial SCGs
278+
ruby $DIR\/src/scg_blank_$search_engine\.rb $search_engine $proteins $DIR\/db/bac.all.faa $DIR\/db/bac.scg.faa $DIR\/db/bac.scg.lookup $threads > /dev/null 2>&1
279+
mv $proteins\.scg $bscg
280+
#predict archaeal SCGs
281+
ruby $DIR\/src/scg_blank_$search_engine\.rb $search_engine $proteins $DIR\/db/arc.all.faa $DIR\/db/arc.scg.faa $DIR\/db/arc.scg.lookup $threads > /dev/null 2>&1
282+
mv $proteins\.scg $ascg
283+
rm $proteins\.findSCG.b6 $proteins\.scg.candidates.faa $proteins\.all.b6
284+
else
285+
echo found predicted single copy genes:
286+
echo " " $bscg
287+
echo " " $ascg
288+
echo skipping single copy gene identification
289+
fi
290+
291+
292+
293+
#
294+
# 3. Calculate Contig Length
295+
#
296+
length_table=$outputbasename\.seqlength
297+
awk '/^>/ {if (x){print x}; print ;x=0;next; } {x += length($0)}END{print x}' $contigs | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n\([^>]\)/?\1/g' | tr '?' '\t' | sed 's/>//g' > $length_table
298+
299+
300+
301+
#
302+
# 4. Run DAS Tool
303+
#
304+
cd $DIR
305+
Rscript $DIR\/src/DAS_Tool.R $scaffoldstobins $binlabels $contigs $bscg $ascg $outputbasename $score_threshold $threads $thisdir $length_table $write_bin_evals $create_plots $debug
306+
cd $thisdir
307+
308+
309+
310+
#
311+
# 5. Extract Bins
312+
#
313+
if [ "$write_bins" == "TRUE" ]; then
314+
scaf2bin=$outputbasename\_DASTool_scaffolds2bin.txt
315+
if test -f $scaf2bin; then
316+
317+
bin_folder=$outputbasename\_DASTool_bins
318+
if [ -d "$bin_folder" ]; then
319+
rm -r $bin_folder
320+
fi
321+
mkdir $bin_folder
322+
323+
echo extracting bins to $bin_folder
324+
while read scaffold bin
325+
do
326+
echo $scaffold >> $bin_folder\/$bin\.ctgtmp
327+
done < $scaf2bin
328+
329+
for i in $bin_folder\/*.ctgtmp
330+
do
331+
bin_id=$(echo $i | perl -pe 's/\.ctgtmp//g')
332+
pullseq -i $contigs -n $i > $bin_id\.contigs.fa
333+
rm $i
334+
done
335+
fi
336+
fi

INSTALL.html

+184
Large diffs are not rendered by default.

INSTALL.md

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#Installation instructions for DAS Tool version 1.0
2+
3+
#1. Dependencies
4+
5+
- R (>= 3.2.3): https://www.r-project.org
6+
- R-packages: data.table (>= 1.9.6), doMC (>= 1.3.4), ggplot2 (>= 2.1.0)
7+
- ruby (>= v2.3.1): https://www.ruby-lang.org
8+
- Pullseq (>= 1.0.2): https://github.com/bcthomas/pullseq
9+
- Prodigal (>= 2.6.3): https://github.com/hyattpd/Prodigal
10+
- One of the following search engines:
11+
- USEARCH (>= 8.1): http://www.drive5.com/usearch/download.html
12+
- DIAMOND (>= 0.8.24): https://github.com/bbuchfink/diamond
13+
- BLAST+ (>= 2.5.0): https://blast.ncbi.nlm.nih.gov/Blast.cgi
14+
15+
16+
#2. Installation
17+
18+
```
19+
# Download and extract DASTool.zip archive:
20+
unzip DAS_Tool.v1.0.zip
21+
cd ./DAS_Tool.v1.0
22+
23+
# Install R-packages:
24+
R CMD INSTALL ./package/DASTool_1.0.0.tar.gz
25+
26+
# Call DAS Tool:
27+
./DAS_Tool.sh -h
28+
```
29+
For detaile instructions please read the documentation.

README.html

+232
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)