|
| 1 | +#!/bin/bash |
| 2 | + |
| 3 | +# DAS Tool for genome-resolved metagenomics |
| 4 | +# by Christian MK Sieber ([email protected]) |
| 5 | +# |
| 6 | +# |
| 7 | +# DAS Tool Copyright (c) 2017, The Regents of the University of California, through Lawrence |
| 8 | +# Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. |
| 9 | +# Dept. of Energy). All rights reserved. |
| 10 | +# |
| 11 | +# If you have questions about your rights to use or distribute this software, please contact |
| 12 | +# Berkeley Lab's Innovation and Partnerships department at [email protected] referring to |
| 13 | +# "DAS Tool (2017-024)". |
| 14 | +# |
| 15 | +# NOTICE. This software was developed under funding from the U.S. Department of Energy. As |
| 16 | +# such, the U.S. Government has been granted for itself and others acting on its behalf a |
| 17 | +# paid-up, nonexclusive, irrevocable, worldwide license in the Software to reproduce, |
| 18 | +# prepare derivative works, and perform publicly and display publicly. The U.S. Government |
| 19 | +# is granted for itself and others acting on its behalf a paid-up, nonexclusive, |
| 20 | +# irrevocable, worldwide license in the Software to reproduce, prepare derivative works, |
| 21 | +# distribute copies to the public, perform publicly and display publicly, and to permit |
| 22 | +# others to do so. |
| 23 | +# |
| 24 | + |
| 25 | +SOURCE="${BASH_SOURCE[0]}" |
| 26 | +while [ -h "$SOURCE" ]; do |
| 27 | + DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" |
| 28 | + SOURCE="$(readlink "$SOURCE")" |
| 29 | + [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" |
| 30 | +done |
| 31 | +DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" |
| 32 | + |
| 33 | +thisdir=$(pwd) |
| 34 | + |
| 35 | +function display_version() { |
| 36 | + echo |
| 37 | + echo "DAS Tool version 1.0" |
| 38 | + echo |
| 39 | + exit 1 |
| 40 | +} |
| 41 | + |
| 42 | + |
| 43 | +function display_help() { |
| 44 | + echo " " |
| 45 | + echo "DAS Tool version 1.0" |
| 46 | + echo " " |
| 47 | + echo "Usage: DAS_Tool.sh -i methodA.scaffolds2bin,...,methodN.scaffolds2bin" |
| 48 | + echo " -l methodA,...,methodN -c contigs.fa -o myOutput" |
| 49 | + echo |
| 50 | + echo " -i, --bins Comma separated list of tab separated scaffolds to bin tables." |
| 51 | + echo " -c, --contigs Contigs in fasta format." |
| 52 | + echo " -o, --outputbasename Basename of output files." |
| 53 | + echo " -l, --labels Comma separated list of binning prediction names. (optional)" |
| 54 | + echo " --search_engine Engine used for single copy gene identification [blast/diamond/usearch]." |
| 55 | + echo " (default: usearch)" |
| 56 | + echo " --write_bin_evals Write evaluation for each input bin set [0/1]. (default: 1)" |
| 57 | + echo " --create_plots Create binning performance plots [0/1]. (default: 1)" |
| 58 | + echo " --write_bins Export bins as fasta files [0/1]. (default: 0)" |
| 59 | + echo " --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo)." |
| 60 | + echo " Gene prediction step will be skipped if given. (optional)" |
| 61 | + echo " --score_threshold Score threshold until selection algorithm will keep selecting bins [0..1]." |
| 62 | + echo " (default: 0.1)" |
| 63 | + echo " -t, --threads Number of threads to use. (default: 1)" |
| 64 | + echo " -v, --version Print version number and exit." |
| 65 | + echo " -h, --help Show this message." |
| 66 | + echo |
| 67 | + echo "Example 1: Run DAS Tool on binning predictions of MetaBAT, MaxBin, CONCOCT and tetraESOMs. Output files will start with the prefix DASToolRun1: " |
| 68 | + echo " ./DAS_Tool.sh -i sample_data/sample.human.gut_concoct_scaffolds2bin.tsv,sample_data/sample.human.gut_maxbin2_scaffolds2bin.tsv,sample_data/sample.human.gut_metabat_scaffolds2bin.tsv,sample_data/sample.human.gut_tetraESOM_scaffolds2bin.tsv -l concoct,maxbin,metabat,tetraESOM -c sample_data/sample.human.gut_contigs.fa -o sample_output/DASToolRun1" |
| 69 | + echo |
| 70 | + echo "Example 2: Run DAS Tool again with different parameters. Use the proteins predicted in Example 1 to skip the gene prediction step. Set the number of threads to 2 and score threshold to 0.6. Output files will start with the prefix DASToolRun2: " |
| 71 | + echo " ./DAS_Tool.sh -i sample_data/sample.human.gut_concoct_scaffolds2bin.tsv,sample_data/sample.human.gut_maxbin2_scaffolds2bin.tsv,sample_data/sample.human.gut_metabat_scaffolds2bin.tsv,sample_data/sample.human.gut_tetraESOM_scaffolds2bin.tsv -l concoct,maxbin,metabat,tetraESOM -c sample_data/sample.human.gut_contigs.fa -o sample_output/DASToolRun2 --threads 2 --score_threshold 0.6 --proteins sample_output/DASToolRun1_proteins.faa" |
| 72 | + echo " " |
| 73 | + echo " " |
| 74 | + echo "Please cite: https://doi.org/10.1101/107789" |
| 75 | + echo " " |
| 76 | + echo " " |
| 77 | + exit 1 |
| 78 | +} |
| 79 | + |
| 80 | +[ $# -eq 0 ] && { display_help ; exit 1; } |
| 81 | + |
| 82 | +binlabels="NULL" |
| 83 | +mappedreads="NULL" |
| 84 | +coverage_table="NULL" |
| 85 | +debug="FALSE" |
| 86 | +use_gc=1 |
| 87 | +use_N50=1 |
| 88 | +executed_prodigal=0 |
| 89 | +score_threshold=0.1 |
| 90 | +threads=1 |
| 91 | +write_bin_evals=1 |
| 92 | +create_plots=1 |
| 93 | +write_bins=0 |
| 94 | +search_engine="usearch" |
| 95 | + |
| 96 | +while [ "$1" != "" ]; do |
| 97 | + case $1 in |
| 98 | + -i | --bins ) shift |
| 99 | + scaffoldstobins=$1 |
| 100 | + ;; |
| 101 | + -l | --labels ) shift |
| 102 | + binlabels=$1 |
| 103 | + ;; |
| 104 | + -c | --contigs ) shift |
| 105 | + contigs=$1 |
| 106 | + ;; |
| 107 | + -o | --outputbasename ) shift |
| 108 | + outputbasename=$1 |
| 109 | + ;; |
| 110 | + --write_bin_evals ) shift |
| 111 | + write_bin_evals=$1 |
| 112 | + ;; |
| 113 | + --create_plots ) shift |
| 114 | + create_plots=$1 |
| 115 | + ;; |
| 116 | + --write_bins ) shift |
| 117 | + write_bins=$1 |
| 118 | + ;; |
| 119 | + --proteins ) shift |
| 120 | + proteins=$1 |
| 121 | + ;; |
| 122 | + --score_threshold ) shift |
| 123 | + score_threshold=$1 |
| 124 | + ;; |
| 125 | + --search_engine ) shift |
| 126 | + search_engine=$1 |
| 127 | + ;; |
| 128 | + -t | --threads ) shift |
| 129 | + threads=$1 |
| 130 | + ;; |
| 131 | + -v | --version ) shift |
| 132 | + display_version |
| 133 | + exit |
| 134 | + ;; |
| 135 | + --debug ) shift |
| 136 | + debug="TRUE" |
| 137 | + ;; |
| 138 | + -h | --help ) display_help |
| 139 | + exit |
| 140 | + ;; |
| 141 | + * ) display_help |
| 142 | + exit 1 |
| 143 | + esac |
| 144 | + shift |
| 145 | +done |
| 146 | + |
| 147 | + |
| 148 | +if [ "$write_bins" == "0" ] || [ "$write_bins" == "false" ] || [ "$write_bins" == "False" ] || [ "$write_bins" == "f" ]; then |
| 149 | +write_bins="FALSE" |
| 150 | +else |
| 151 | +write_bins="TRUE" |
| 152 | +fi |
| 153 | +if [ "$write_bin_evals" == "0" ] || [ "$write_bin_evals" == "false" ] || [ "$write_bin_evals" == "False" ] || [ "$write_bin_evals" == "f" ]; then |
| 154 | +write_bin_evals="FALSE" |
| 155 | +else |
| 156 | +write_bin_evals="TRUE" |
| 157 | +fi |
| 158 | +if [ "$create_plots" == "0" ] || [ "$create_plots" == "false" ] || [ "$create_plots" == "False" ] || [ "$create_plots" == "f" ]; then |
| 159 | +create_plots="FALSE" |
| 160 | +else |
| 161 | +create_plots="TRUE" |
| 162 | +fi |
| 163 | + |
| 164 | + |
| 165 | +# |
| 166 | +# 0.1 Load dependencies |
| 167 | +# |
| 168 | +PATH="$PATH:$DASTOOL_PULLSEQ" |
| 169 | +PATH="$PATH:$DASTOOL_PRODIGAL" |
| 170 | +PATH="$PATH:$DASTOOL_DIAMOND" |
| 171 | +PATH="$PATH:$DASTOOL_BLAST" |
| 172 | +PATH="$PATH:$DASTOOL_USEARCH" |
| 173 | + |
| 174 | + |
| 175 | +#check search engine |
| 176 | +if [ "$search_engine" == "diamond" ] || [ "$search_engine" == "DIAMOND" ] || [ "$search_engine" == "Diamond" ] || [ "$search_engine" == "d" ] || [ "$search_engine" == "D" ]; then |
| 177 | +command -v diamond >/dev/null 2>&1 || { echo >&2 "Can't find diamond. Aborting."; exit 1; } |
| 178 | +search_engine="diamond" |
| 179 | +fi |
| 180 | +if [ "$search_engine" == "blast" ] || [ "$search_engine" == "BLAST" ] || [ "$search_engine" == "Blast" ] || [ "$search_engine" == "b" ] || [ "$search_engine" == "B" ] || [ "$search_engine" == "blastp" ]; then |
| 181 | +command -v makeblastdb >/dev/null 2>&1 || { echo >&2 "Can't find makeblastdb. Aborting."; exit 1; } |
| 182 | +command -v blastp >/dev/null 2>&1 || { echo >&2 "Can't find blastp. Aborting."; exit 1; } |
| 183 | +search_engine="blast" |
| 184 | +fi |
| 185 | +if [ "$search_engine" == "usearch" ] || [ "$search_engine" == "USEARCH" ] || [ "$search_engine" == "UBLAST" ] || [ "$search_engine" == "Usearch" ] || [ "$search_engine" == "u" ] || [ "$search_engine" == "U" ] || [ "$search_engine" == "ublast" ] || [ "$search_engine" == "Ublast" ]; then |
| 186 | +command -v usearch >/dev/null 2>&1 || { echo >&2 "Can't find usearch. Aborting."; exit 1; } |
| 187 | +search_engine="usearch" |
| 188 | +fi |
| 189 | + |
| 190 | +# |
| 191 | +# 0.2 Check existence of files |
| 192 | +# |
| 193 | +if [ ! -f $contigs ] |
| 194 | +then |
| 195 | + echo contig file not found: $contigs |
| 196 | + echo stopping |
| 197 | + exit 1 |
| 198 | +fi |
| 199 | + |
| 200 | +if [ ! -z "$proteins" ] && [ ! -f $proteins ] |
| 201 | +then |
| 202 | + echo proteins file not found: $proteins |
| 203 | + echo stopping |
| 204 | + exit 1 |
| 205 | +fi |
| 206 | + |
| 207 | +if [ "$coverage_table" != "NULL" ] && [ ! -f $coverage_table ] |
| 208 | +then |
| 209 | + echo coverage table not found: $coverage_table |
| 210 | + echo stopping |
| 211 | + exit 1 |
| 212 | +fi |
| 213 | + |
| 214 | +if [ "$mappedreads" != "NULL" ] && [ ! -f $mappedreads ] |
| 215 | +then |
| 216 | + echo bam file not found: $mappedreads |
| 217 | + echo stopping |
| 218 | + exit 1 |
| 219 | +fi |
| 220 | + |
| 221 | +for i in $(echo $scaffoldstobins | tr "," " ") |
| 222 | +do |
| 223 | +if [ ! -f $i ]; |
| 224 | +then |
| 225 | + echo "scaffolds2bin file not found: $i " |
| 226 | + exit 1 |
| 227 | +fi |
| 228 | +done |
| 229 | + |
| 230 | + |
| 231 | + |
| 232 | +# |
| 233 | +# 1. Run Prodigal |
| 234 | +# |
| 235 | +if [ -z "$proteins" ] |
| 236 | +then |
| 237 | + |
| 238 | + #check if dependencies for gene prediction are installed |
| 239 | + command -v pullseq >/dev/null 2>&1 || { echo >&2 "Can't find pullseq. Aborting."; exit 1; } |
| 240 | + command -v prodigal >/dev/null 2>&1 || { echo >&2 "Can't find prodigal. Aborting."; exit 1; } |
| 241 | + |
| 242 | + proteins=$outputbasename\_proteins.faa |
| 243 | + |
| 244 | + echo "predicting genes using prodigal" |
| 245 | + grep ">" $contigs | perl -pe 's/>//g;' | shuf > dastoolthreadstmpheader |
| 246 | + dastoolthreadstmpheader=dastoolthreadstmpheader |
| 247 | + total_lines=$(wc -l <${dastoolthreadstmpheader}) |
| 248 | + ((lines_per_file = (total_lines + threads - 1) / threads)) |
| 249 | + split --numeric-suffixes --lines=${lines_per_file} ${dastoolthreadstmpheader} dastoolthreadstmp. |
| 250 | + rm dastoolthreadstmpheader |
| 251 | + |
| 252 | + for i in dastoolthreadstmp.* |
| 253 | + do |
| 254 | + pullseq --input $contigs --names $i > $i\_contigs.fa |
| 255 | + prodigal -a $i\_partialprot.faa -i $i\_contigs.fa -p meta -m -q > /dev/null 2>&1 & |
| 256 | + done |
| 257 | + wait |
| 258 | + cat *_partialprot.faa > $proteins |
| 259 | + rm dastoolthreadstmp.* |
| 260 | + executed_prodigal=1 |
| 261 | +elif [ ! -f $proteins ] |
| 262 | +then |
| 263 | + echo protein file not found: $proteins |
| 264 | + echo stopping |
| 265 | + exit 1 |
| 266 | +fi |
| 267 | + |
| 268 | + |
| 269 | + |
| 270 | +# |
| 271 | +# 2. Predict Single Copy Genes |
| 272 | +# |
| 273 | +bscg=$proteins\.bacteria.scg |
| 274 | +ascg=$proteins\.archaea.scg |
| 275 | +if [ ! -f $bscg ] || [ ! -f $ascg ] || [ $executed_prodigal == 1 ]; then |
| 276 | +echo "identifying single copy genes using $search_engine" |
| 277 | +#predict bacterial SCGs |
| 278 | +ruby $DIR\/src/scg_blank_$search_engine\.rb $search_engine $proteins $DIR\/db/bac.all.faa $DIR\/db/bac.scg.faa $DIR\/db/bac.scg.lookup $threads > /dev/null 2>&1 |
| 279 | +mv $proteins\.scg $bscg |
| 280 | +#predict archaeal SCGs |
| 281 | +ruby $DIR\/src/scg_blank_$search_engine\.rb $search_engine $proteins $DIR\/db/arc.all.faa $DIR\/db/arc.scg.faa $DIR\/db/arc.scg.lookup $threads > /dev/null 2>&1 |
| 282 | +mv $proteins\.scg $ascg |
| 283 | +rm $proteins\.findSCG.b6 $proteins\.scg.candidates.faa $proteins\.all.b6 |
| 284 | +else |
| 285 | + echo found predicted single copy genes: |
| 286 | + echo " " $bscg |
| 287 | + echo " " $ascg |
| 288 | + echo skipping single copy gene identification |
| 289 | +fi |
| 290 | + |
| 291 | + |
| 292 | + |
| 293 | +# |
| 294 | +# 3. Calculate Contig Length |
| 295 | +# |
| 296 | +length_table=$outputbasename\.seqlength |
| 297 | +awk '/^>/ {if (x){print x}; print ;x=0;next; } {x += length($0)}END{print x}' $contigs | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n\([^>]\)/?\1/g' | tr '?' '\t' | sed 's/>//g' > $length_table |
| 298 | + |
| 299 | + |
| 300 | + |
| 301 | +# |
| 302 | +# 4. Run DAS Tool |
| 303 | +# |
| 304 | +cd $DIR |
| 305 | +Rscript $DIR\/src/DAS_Tool.R $scaffoldstobins $binlabels $contigs $bscg $ascg $outputbasename $score_threshold $threads $thisdir $length_table $write_bin_evals $create_plots $debug |
| 306 | +cd $thisdir |
| 307 | + |
| 308 | + |
| 309 | + |
| 310 | +# |
| 311 | +# 5. Extract Bins |
| 312 | +# |
| 313 | +if [ "$write_bins" == "TRUE" ]; then |
| 314 | +scaf2bin=$outputbasename\_DASTool_scaffolds2bin.txt |
| 315 | +if test -f $scaf2bin; then |
| 316 | + |
| 317 | +bin_folder=$outputbasename\_DASTool_bins |
| 318 | +if [ -d "$bin_folder" ]; then |
| 319 | + rm -r $bin_folder |
| 320 | +fi |
| 321 | +mkdir $bin_folder |
| 322 | + |
| 323 | +echo extracting bins to $bin_folder |
| 324 | +while read scaffold bin |
| 325 | +do |
| 326 | +echo $scaffold >> $bin_folder\/$bin\.ctgtmp |
| 327 | +done < $scaf2bin |
| 328 | + |
| 329 | +for i in $bin_folder\/*.ctgtmp |
| 330 | +do |
| 331 | +bin_id=$(echo $i | perl -pe 's/\.ctgtmp//g') |
| 332 | +pullseq -i $contigs -n $i > $bin_id\.contigs.fa |
| 333 | +rm $i |
| 334 | +done |
| 335 | +fi |
| 336 | +fi |
0 commit comments