Skip to content

Commit 294195f

Browse files
Fix to percent identity and alignment length filtering when using BLASTP.
1 parent d3d4b20 commit 294195f

File tree

5 files changed

+42
-13
lines changed

5 files changed

+42
-13
lines changed

bin/comparem

+3
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ if __name__ == '__main__':
134134
similarity_parser.add_argument('-a', '--per_aln_len', type=float, default=70.0, help="minimum percent coverage of query sequence for reporting an alignment")
135135
similarity_parser.add_argument('-x', '--file_ext', default='faa', help="extension of files to process")
136136
similarity_parser.add_argument('--blastp', action="store_true", default=False, help="use Blastp-fast instead of DIAMOND")
137+
similarity_parser.add_argument('--sensitive', action="store_true", default=False, help="use sensitive mode of DIAMOND")
137138
similarity_parser.add_argument('--keep_headers', action="store_true", default=False, help="indicates FASTA headers already have the format <genome_id>~<gene_id>")
138139
similarity_parser.add_argument('--tmp_dir', action=ChangeTempAction, default=tempfile.gettempdir(), help="specify alternative directory for temporary files")
139140
similarity_parser.add_argument('-c', '--cpus', help='number of CPUs to use', type=int, default=1)
@@ -184,6 +185,7 @@ if __name__ == '__main__':
184185
aai_wf_parser.add_argument('--proteins', action="store_true", default=False, help="indicates the input files contain protein sequences")
185186
aai_wf_parser.add_argument('--force_table', type=int, default=None, help="force use of specific translation table")
186187
aai_wf_parser.add_argument('--blastp', action="store_true", default=False, help="use blastp instead of diamond")
188+
aai_wf_parser.add_argument('--sensitive', action="store_true", default=False, help="use sensitive mode of DIAMOND")
187189
aai_wf_parser.add_argument('--keep_headers', action="store_true", default=False, help="indicates FASTA headers already have the format <genome_id>~<gene_id>")
188190
aai_wf_parser.add_argument('--keep_rbhs', help="create file with reciprocal best hits", action='store_true')
189191
aai_wf_parser.add_argument('--tmp_dir', action=ChangeTempAction, default=tempfile.gettempdir(), help="specify alternative directory for temporary files")
@@ -206,6 +208,7 @@ if __name__ == '__main__':
206208
classify_wf_parser.add_argument('--proteins', action="store_true", default=False, help="indicates the input files contain protein sequences")
207209
classify_wf_parser.add_argument('--force_table', type=int, default=None, help="force use of specific translation table")
208210
classify_wf_parser.add_argument('--blastp', action="store_true", default=False, help="use blastp instead of diamond")
211+
classify_wf_parser.add_argument('--sensitive', action="store_true", default=False, help="use sensitive mode of DIAMOND")
209212
classify_wf_parser.add_argument('--keep_headers', action="store_true", default=False, help="indicates FASTA headers already have the format <genome_id>~<gene_id>")
210213
classify_wf_parser.add_argument('--keep_rbhs', help="create file with reciprocal best hits", action='store_true')
211214
classify_wf_parser.add_argument('--tmp_dir', action=ChangeTempAction, default=tempfile.gettempdir(), help="specify alternative directory for temporary files")

comparem/VERSION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.0.21
1+
0.0.23

comparem/aai_calculator.py

+5
Original file line numberDiff line numberDiff line change
@@ -142,11 +142,16 @@ def _valid_hits(self, hit_table_stream,
142142
hit = hit_table_stream.readline().split('\t')
143143

144144
perc_iden = float(hit[4])
145+
if perc_iden < per_identity_threshold:
146+
continue
147+
145148
evalue = float(hit[12])
146149

147150
query_id = hit[0] + '~' + hit[1]
148151
query_coverage = int(hit[9]) - int(hit[8]) + 1
149152
per_aln_len = query_coverage * 100.0 / self.gene_lengths[query_id]
153+
if per_aln_len < per_aln_len_threshold:
154+
continue
150155

151156
target_genome = hit[2]
152157
target_id = target_genome + '~' + hit[3]

comparem/main.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -138,13 +138,18 @@ def ani(self, options):
138138

139139
def call_genes(self, options):
140140
"""Call genes command"""
141-
141+
142142
make_sure_path_exists(options.output_dir)
143143

144144
genome_files = self._input_files(options.input_genomes, options.file_ext)
145145

146146
prodigal = Prodigal(options.cpus, not options.silent)
147-
summary_stats = prodigal.run(genome_files, options.output_dir, False, options.force_table, False)
147+
summary_stats = prodigal.run(genome_files,
148+
options.output_dir,
149+
called_genes=False,
150+
translation_table=options.force_table,
151+
meta=False,
152+
closed_ends=True)
148153

149154
# write gene calling summary
150155
fout = open(os.path.join(options.output_dir, 'call_genes.summary.tsv'), 'w')
@@ -175,6 +180,7 @@ def similarity(self, options):
175180
True,
176181
options.tmp_dir,
177182
options.blastp,
183+
options.sensitive,
178184
options.keep_headers,
179185
options.output_dir)
180186

comparem/similarity_search.py

+25-10
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ def _run_self_diamond(self, query_gene_file,
148148
per_identity,
149149
per_aln_len,
150150
max_hits,
151+
sensitive,
151152
high_mem,
152153
tmp_dir,
153154
output_dir):
@@ -193,7 +194,8 @@ def _run_self_diamond(self, query_gene_file,
193194
evalue,
194195
per_identity,
195196
per_aln_len,
196-
max_hits,
197+
max_hits,
198+
sensitive,
197199
tmp_hits_table.name,
198200
'standard',
199201
tmp_dir,
@@ -205,7 +207,8 @@ def _run_self_diamond(self, query_gene_file,
205207
evalue,
206208
per_identity,
207209
per_aln_len,
208-
max_hits,
210+
max_hits,
211+
sensitive,
209212
tmp_hits_table.name,
210213
'standard',
211214
tmp_dir)
@@ -221,6 +224,7 @@ def _run_reciprocal_diamond(self, query_gene_file,
221224
per_identity,
222225
per_aln_len,
223226
max_hits,
227+
sensitive,
224228
high_mem,
225229
tmp_dir,
226230
output_dir):
@@ -272,9 +276,10 @@ def _run_reciprocal_diamond(self, query_gene_file,
272276
evalue,
273277
per_identity,
274278
per_aln_len,
275-
max_hits,
279+
max_hits,
280+
sensitive,
276281
tmp_query_hits_table.name,
277-
'tab',
282+
'standard',
278283
tmp_dir,
279284
chunk_size=1,
280285
block_size=8)
@@ -284,9 +289,10 @@ def _run_reciprocal_diamond(self, query_gene_file,
284289
evalue,
285290
per_identity,
286291
per_aln_len,
287-
max_hits,
292+
max_hits,
293+
sensitive,
288294
tmp_query_hits_table.name,
289-
'tab',
295+
'standard',
290296
tmp_dir)
291297

292298
# get target genes hit by one or more query proteins
@@ -321,7 +327,8 @@ def _run_reciprocal_diamond(self, query_gene_file,
321327
evalue,
322328
per_identity,
323329
per_aln_len,
324-
max_hits,
330+
max_hits,
331+
sensitive,
325332
tmp_target_hits_table.name,
326333
'standard',
327334
tmp_dir,
@@ -333,7 +340,8 @@ def _run_reciprocal_diamond(self, query_gene_file,
333340
evalue,
334341
per_identity,
335342
per_aln_len,
336-
max_hits,
343+
max_hits,
344+
sensitive,
337345
tmp_target_hits_table.name,
338346
'standard',
339347
tmp_dir)
@@ -352,6 +360,7 @@ def run(self, query_gene_files,
352360
high_mem,
353361
tmp_dir,
354362
blastp,
363+
sensitive,
355364
keep_headers,
356365
output_dir):
357366
"""Perform similarity search of query genes against target genes.
@@ -371,7 +380,9 @@ def run(self, query_gene_files,
371380
tmp_dir : str
372381
Directory to store temporary files.
373382
blastp : boolean
374-
If True blasp-fast is used instead of DIAMOND.
383+
If True, blasp-fast is used instead of DIAMOND.
384+
sensitive : boolean
385+
If True, the sensitive mode of DIAMOND is used.
375386
keep_headers : boolean
376387
If True, indicates FASTA headers already have the format <genome_id>~<gene_id>.
377388
output_dir : str
@@ -407,7 +418,9 @@ def run(self, query_gene_files,
407418
tmp_dir,
408419
output_dir)
409420
else:
410-
self._run_reciprocal_blasp(query_gene_file,
421+
self.logger.info('NOT YET IMPLEMENTED!')
422+
sys.exit()
423+
self._run_reciprocal_blastp(query_gene_file,
411424
target_gene_file,
412425
evalue,
413426
per_identity,
@@ -423,6 +436,7 @@ def run(self, query_gene_files,
423436
per_identity,
424437
per_aln_len,
425438
len(target_gene_files) * 10,
439+
sensitive,
426440
high_mem,
427441
tmp_dir,
428442
output_dir)
@@ -435,6 +449,7 @@ def run(self, query_gene_files,
435449
per_identity,
436450
per_aln_len,
437451
len(target_gene_files) * 10,
452+
sensitive,
438453
high_mem,
439454
tmp_dir,
440455
output_dir)

0 commit comments

Comments
 (0)