-
Notifications
You must be signed in to change notification settings - Fork 1
/
sraX
executable file
·334 lines (271 loc) · 11.9 KB
/
sraX
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use FindBin;
use lib "$FindBin::RealBin";
use Cwd qw(abs_path);
use sraXlib::Functions;
use sraXlib::Dir;
use sraXlib::DB;
use sraXlib::Blast;
use sraXlib::Parse;
use sraXlib::SNP;
my $f_log = "";
my $err_msg = "";
my $d_gnm_abs = "";
my $d_out_abs = "";
my %opt = (
'input' => undef,
'output' => undef,
'seqal' => "dblastx",
'msa' => "muscle",
'eval' => 1e-05,
'id' => 85,
'aln_cov' => 60,
'dbsearch' => "basic",
'threads' => 6,
'user_sq' => undef,
'version' => undef,
'debug' => undef,
'help' => undef,
);
if (@ARGV == 0){
$err_msg = "\nFor running 'sraX', it's mandatory to provide, at least, a directory containing the genome(s) that you want to analyze.\n";
$err_msg .= "sraX execution is stopped and the resistome analysis is aborted now.\n";
print STDERR $err_msg;
die usage();
}
GetOptions(
'input|i=s' => \$opt{input},
'output|o=s' => \$opt{output},
'seqal|s=s' => \$opt{seqal},
'msa|a=s' => \$opt{msa},
'eval|e=f' => \$opt{eval},
'id=f' => \$opt{id},
'aln_cov|c=f' => \$opt{aln_cov},
'dbsearch|db=s' => \$opt{dbsearch},
'user_sq|u=s' => \$opt{user_sq},
'threads|t=i' => \$opt{threads},
'version|v' => \$opt{version},
'debug|d' => \$opt{debug},
'help|h' => \$opt{help},
);
die usage() if $opt{help};
if($opt{version}){
print << "VERSION";
sraX - Systematic Resistome Analysis
version: sraXv1.5
Copyright 2020 Leonardo G. Panunzi
VERSION
exit;
}
unless ($opt{input}){
$err_msg = "\nFor running 'sraX', it's mandatory to provide, at least, a directory containing the genome(s) that you want to analyze.\n";
$err_msg .= "sraX execution is stopped and the resistome analysis is aborted now.\n";
print STDERR $err_msg;
die usage();
}else{
$d_gnm_abs = abs_path($opt{input});
my $d_pres = sraXlib::Functions::check_dir($d_gnm_abs);
unless($d_pres == 1){
$err_msg = "\n[Error]: The given directory '$opt{input}' is not present\n";
$err_msg .= "Please, check its existence or given name.\n";
$err_msg .= "sraX execution is stopped and the resistome analysis is aborted now.\n";
print STDERR $err_msg;
die usage();
}else{
print "-" x 66 ."\n";
print "The 'sraX' analysis started at:\t".sraXlib::Functions::print_time."\n";
print "-" x 66 ."\n\n";
$f_log = "-" x 66 ."\n";
$f_log .= "The 'sraX' analysis started at:\t".sraXlib::Functions::print_time."\n";
$f_log .= "-" x 66 ."\n\n";
}
}
my $t_start_time_cp = sraXlib::Functions::running_time;
if($opt{seqal} ne 'dblastx' && $opt{seqal} ne 'blastx'){
$f_log .= sraXlib::Functions::print_e('s', $opt{seqal});
print STDERR sraXlib::Functions::print_e('s', $opt{seqal}) if $opt{debug};
die usage();
}else{
$f_log .= sraXlib::Functions::print_ne('s', $opt{seqal});
print STDERR sraXlib::Functions::print_ne('s', $opt{seqal}) if $opt{debug};
}
if($opt{msa} ne 'muscle' && $opt{msa} ne 'clustalo' && $opt{msa} ne 'mafft'){
$f_log .= sraXlib::Functions::print_e('a', $opt{msa});
print STDERR sraXlib::Functions::print_e('a', $opt{msa}) if $opt{debug};
die usage();
}else{
$f_log .= sraXlib::Functions::print_ne('a', $opt{msa});
print STDERR sraXlib::Functions::print_ne('a', $opt{msa}) if $opt{debug};
}
if($opt{dbsearch} ne 'basic' && $opt{dbsearch} ne 'ext' && $opt{dbsearch} ne 'extensive'){
$f_log .= sraXlib::Functions::print_e('db', $opt{dbsearch});
print STDERR sraXlib::Functions::print_e('db', $opt{dbsearch}) if $opt{debug};
die usage();
}else{
$f_log .= sraXlib::Functions::print_ne('db', $opt{dbsearch});
print STDERR sraXlib::Functions::print_ne('db', $opt{dbsearch}) if $opt{debug};
}
if( $opt{eval} < 0){
$f_log .= sraXlib::Functions::print_e('e', $opt{eval});
print STDERR sraXlib::Functions::print_e('e', $opt{eval}) if $opt{debug};
die usage();
}else{
$f_log .= sraXlib::Functions::print_ne('e', $opt{eval});
print STDERR sraXlib::Functions::print_ne('e', $opt{eval}) if $opt{debug};
}
if( $opt{id} > 100 || $opt{id} < 0){
$f_log .= sraXlib::Functions::print_e('id', $opt{id});
print STDERR sraXlib::Functions::print_e('id', $opt{id}) if $opt{debug};
die usage();
}else{
$f_log .= sraXlib::Functions::print_ne('id', $opt{id});
print STDERR sraXlib::Functions::print_ne('id', $opt{id}) if $opt{debug};
}
if( $opt{aln_cov} > 100 || $opt{aln_cov} < 0){
$f_log .= sraXlib::Functions::print_e('c', $opt{aln_cov});
print STDERR sraXlib::Functions::print_e('c', $opt{aln_cov}) if $opt{debug};
die usage();
}else{
$f_log .= sraXlib::Functions::print_ne('c', $opt{aln_cov});
print STDERR sraXlib::Functions::print_ne('c', $opt{aln_cov}) if $opt{debug};
}
if( $opt{threads} > 12 || $opt{threads} < 4){
$f_log .= sraXlib::Functions::print_e('t', $opt{threads});
print STDERR sraXlib::Functions::print_e('t', $opt{threads}) if $opt{debug};
die usage();
}else{
$f_log .= sraXlib::Functions::print_ne('t', $opt{threads});
print STDERR sraXlib::Functions::print_ne('t', $opt{threads}) if $opt{debug};
}
unless ($opt{output}){
$opt{output} = $opt{input}."_sraX_".$opt{id}."_".$opt{aln_cov}."_".$opt{seqal};
$d_out_abs = abs_path($opt{output});
print STDERR sraXlib::Functions::print_ne('o', $opt{output}) if $opt{debug};
$f_log .= sraXlib::Functions::print_ne('o', $opt{output});
my $d_pres = sraXlib::Functions::check_dir($d_out_abs);
unless($d_pres == 1){
print "\nThe results will be placed in the following directory: '$opt{output}'.\n" if $opt{debug};
$f_log .= "\nThe results will be placed in the following directory: '$opt{output}'.\n";
}else{
$err_msg = "\n[Warn]: The directory '$opt{output}' is already present, implying that the script has already been run.\n";
$err_msg .= "[Warn]: This directory and all previous results are going to be overwritten now.\n";
print STDERR $err_msg if $opt{debug};
$f_log .= "\n[Warn]: The directory '$opt{output}' is already present, implying that the script has already been run.\n";
$f_log .= "[Warn]: This directory and all previous results are going to be overwritten now.\n";
system("rm -r $d_out_abs");
}
}else{
$d_out_abs = abs_path($opt{output});
my $d_pres = sraXlib::Functions::check_dir($d_out_abs);
unless($d_pres == 1){
print "\nThe results will be placed in the following directory: '$opt{output}'.\n" if $opt{debug};
$f_log .= "\nThe results will be placed in the following directory: '$opt{output}'.\n";
}else{
$err_msg = "\n[Warn]: The directory '$opt{output}' is already present, implying that the script has already been run.\n";
$err_msg .= "[Warn]: This directory and all previous results are going to be overwritten now.\n";
print STDERR $err_msg if $opt{debug};
$f_log .= "\n[Warn]: The directory '$opt{output}' is already present, implying that the script has already been run.\n";
$f_log .= "[Warn]: This directory and all previous results are going to be overwritten now.\n";
system("rm -r $d_out_abs");
}
}
sraXlib::Dir::build_dir_str($d_out_abs);
open LOG, ">>$d_out_abs/Log/sraX_log.txt" || die sraXlib::Functions::print_errf("$d_out_abs/Log/sraX_log.txt","o");
print LOG "$f_log\n";
unless ($opt{user_sq}){
sraXlib::DB::get_publ_db($d_out_abs,"nsq",$opt{dbsearch});
}else{
sraXlib::DB::get_publ_db($d_out_abs,"usq",$opt{dbsearch});
sraXlib::DB::get_user_db($d_out_abs,$opt{user_sq});
}
sraXlib::Blast::set_prmt($d_gnm_abs,$d_out_abs,$opt{seqal},$opt{eval},$opt{id},$opt{aln_cov},$opt{threads});
sraXlib::Parse::f_parse($d_gnm_abs,$d_out_abs,$opt{id},$opt{aln_cov});
sraXlib::SNP::sq_variants($d_out_abs,$opt{msa});
my $t_stop_time_cp = sraXlib::Functions::running_time;
print "\n\tThe completion of 'sraX' analysis took ";
printf ("%.2f ", $t_stop_time_cp - $t_start_time_cp);
print " wallclock secs\n\n";
print "-" x 74 ."\n";
print "The 'sraX' analysis finished at:\t".sraXlib::Functions::print_time."\n";
print "-" x 74 ."\n\n";
print LOG "\n\tThe completion of 'sraX' analysis took ";
printf LOG ("%.2f ", $t_stop_time_cp - $t_start_time_cp);
print LOG " wallclock secs\n\n";
print LOG "-" x 74 ."\n";
print LOG "The 'sraX' analysis finished at:\t".sraXlib::Functions::print_time."\n";
print LOG "-" x 74 ."\n\n";
close LOG;
sub usage{
warn <<EOF
sraX v1.5 | by Leonardo G. Panunzi <lgpanunzi\@gmail.com>
Licensed under the GNU GPL <https://www.gnu.org/licenses/gpl.txt>
https://github.com/lgpdevtools/srax
USAGE:
sraX -i [input folder_name (with genome file(s))] [options]
SYNOPSIS:
sraX is designed to read assembled sequence files in FASTA format and systematically detect the
presence of antimicrobial resistance genes (ARGs). The complete resistome analysis is effectively
accomplished by running a single command. Under default parameters, only a mandatory folder
enclosing the selected genome FASTA files is required. In addition, the following default data
repositories & software dependences are preferred: CARD database (ARG repository), DIAMOND
(sequence aligner), MUSCLE (multiple-sequence aligner, required for SNP detection) and R
environment (visualization plots).
sraX operates in four main stages:
I) Sequence data acquisition: The CARD database is downloaded, while its metadata is further
parsed for compiling a local antimicrobial resistance database (AMR DB).
II) Sequence homology search: dblastx (DIAMOND blastx) or blastx (NCBI blastx) algorithms are
employed for querying the genomes against the previously compiled AMR DB.
III) SNP analysis: Reference (from AMR DB) and corresponding homolog (from assembled genomes)
sequences are gathered into multiple-sequence alignments (MSA) for identifying the SNP positions.
IV) Output summary files and visualization: The R software is employed for producing specific
plots that complement the resulting summary tables, which on the whole are visualized in HTML format
files.
--------------------
- Running commands -
--------------------
Mandatory:
----------
-i|input Input directory [/path/to/input_dir] containing the input file(s), which
must be in FASTA format and consisting of individual assembled genome sequences.
Optional:
---------
-o|output Directory to store obtained results [/path/to/output_dir]. While not
provided, the following default name will be taken:
'input_directory'_'sraX'_'id'_'aln_cov'_'seqal'
Example:
--------
Input directory: 'Test'
Options: -id 85; -c 95; -p dblastx
Output directory: 'Test_sraX_85_95_dblastx'
-s|seqal The preferred algorithm for aligning the assembled genome(s) to a locally
compiled AMR DB. The possible choices are: 'dblastx' (DIAMOND blastx) or 'blastx'
(NCBI blastx). In any case, the process is parallelized (up to 100 genome files are
run simultaneously) for reducing computing times. [string] Default: dblastx
-a|msa The preferred algorithm for producing the alignment of clustered homologous
sequences (multiple-sequence files). The possible choices are: 'muscle', 'clustalo'
or 'mafft'. [string] Default: muscle
Note: The accuracy and computing times are both dependent on the selected algorithm.
-e|eval Minimum evalue cut-off to filter false positives. [number] Default: 1e-05
-id Minimum identity cut-off to filter false positives. [number] Default: 85
-c|aln_cov Minimum length of the query which must align to the reference sequence.
[number] Default: 60
-db|dbsearch The level of the ARG search, on account of the number and type of employed AMR DB.
The possible choices are: 'basic' or 'ext'. The 'basic' option only applies 'CARD',
while the 'ext' option utilizes as well the 'ARGminer' (compilation of multiple AMR
DBs) and 'BACMET' (biocides and metal resistance) repositories. [string] Default:
basic
Note: In operational terms, the extensive search ('ext' option) takes much longer
computing times.
-u|user_sq Customary AMR DB provided by the user. The sequences must be in FASTA format.
-t|threads Number of threads when running sraX. [number] Default: 6
-h|help Displays this help information and exits.
-v|version Displays version information and exits.
-d|debug Verbose output (for debugging).
'sraX' was last modified: 05th February 2020
EOF
;
exit;
}