Fix to percent identity and alignment length filtering when using BLASTP.

donovan-h-parks · donovan-h-parks · commit 294195f2fc6e · 2017-08-05T03:08:45.000+10:00
diff --git a/bin/comparem b/bin/comparem
@@ -134,6 +134,7 @@ if __name__ == '__main__':
     similarity_parser.add_argument('-a', '--per_aln_len', type=float, default=70.0, help="minimum percent coverage of query sequence for reporting an alignment")
     similarity_parser.add_argument('-x', '--file_ext', default='faa', help="extension of files to process")
     similarity_parser.add_argument('--blastp', action="store_true", default=False, help="use Blastp-fast instead of DIAMOND")
+    similarity_parser.add_argument('--sensitive', action="store_true", default=False, help="use sensitive mode of DIAMOND")
     similarity_parser.add_argument('--keep_headers', action="store_true", default=False, help="indicates FASTA headers already have the format <genome_id>~<gene_id>")
     similarity_parser.add_argument('--tmp_dir', action=ChangeTempAction, default=tempfile.gettempdir(), help="specify alternative directory for temporary files")
     similarity_parser.add_argument('-c', '--cpus', help='number of CPUs to use', type=int, default=1)
@@ -184,6 +185,7 @@ if __name__ == '__main__':
     aai_wf_parser.add_argument('--proteins', action="store_true", default=False, help="indicates the input files contain protein sequences")
     aai_wf_parser.add_argument('--force_table', type=int, default=None, help="force use of specific translation table")
     aai_wf_parser.add_argument('--blastp', action="store_true", default=False, help="use blastp instead of diamond")
+    aai_wf_parser.add_argument('--sensitive', action="store_true", default=False, help="use sensitive mode of DIAMOND")
     aai_wf_parser.add_argument('--keep_headers', action="store_true", default=False, help="indicates FASTA headers already have the format <genome_id>~<gene_id>")
     aai_wf_parser.add_argument('--keep_rbhs', help="create file with reciprocal best hits", action='store_true')
     aai_wf_parser.add_argument('--tmp_dir', action=ChangeTempAction, default=tempfile.gettempdir(), help="specify alternative directory for temporary files")
@@ -206,6 +208,7 @@ if __name__ == '__main__':
     classify_wf_parser.add_argument('--proteins', action="store_true", default=False, help="indicates the input files contain protein sequences")
     classify_wf_parser.add_argument('--force_table', type=int, default=None, help="force use of specific translation table")
     classify_wf_parser.add_argument('--blastp', action="store_true", default=False, help="use blastp instead of diamond")
+    classify_wf_parser.add_argument('--sensitive', action="store_true", default=False, help="use sensitive mode of DIAMOND")
     classify_wf_parser.add_argument('--keep_headers', action="store_true", default=False, help="indicates FASTA headers already have the format <genome_id>~<gene_id>")
     classify_wf_parser.add_argument('--keep_rbhs', help="create file with reciprocal best hits", action='store_true')
     classify_wf_parser.add_argument('--tmp_dir', action=ChangeTempAction, default=tempfile.gettempdir(), help="specify alternative directory for temporary files")
diff --git a/comparem/VERSION b/comparem/VERSION
@@ -1 +1 @@
-0.0.21
+0.0.23
diff --git a/comparem/aai_calculator.py b/comparem/aai_calculator.py
@@ -142,11 +142,16 @@ def _valid_hits(self, hit_table_stream,
             hit = hit_table_stream.readline().split('\t')
 
             perc_iden = float(hit[4])
+            if perc_iden < per_identity_threshold:
+                continue
+                
             evalue = float(hit[12])
 
             query_id = hit[0] + '~' + hit[1]
             query_coverage = int(hit[9]) - int(hit[8]) + 1
             per_aln_len = query_coverage * 100.0 / self.gene_lengths[query_id]
+            if per_aln_len < per_aln_len_threshold:
+                continue
 
             target_genome = hit[2]
             target_id = target_genome + '~' + hit[3]
diff --git a/comparem/main.py b/comparem/main.py
@@ -138,13 +138,18 @@ def ani(self, options):
 
     def call_genes(self, options):
         """Call genes command"""
-        
+
         make_sure_path_exists(options.output_dir)
         
         genome_files = self._input_files(options.input_genomes, options.file_ext)
 
         prodigal = Prodigal(options.cpus, not options.silent)
-        summary_stats = prodigal.run(genome_files, options.output_dir, False, options.force_table, False)
+        summary_stats = prodigal.run(genome_files, 
+                                        options.output_dir, 
+                                        called_genes=False, 
+                                        translation_table=options.force_table, 
+                                        meta=False,
+                                        closed_ends=True)
 
         # write gene calling summary
         fout = open(os.path.join(options.output_dir, 'call_genes.summary.tsv'), 'w')
@@ -175,6 +180,7 @@ def similarity(self, options):
                 True,
                 options.tmp_dir,
                 options.blastp,
+                options.sensitive,
                 options.keep_headers,
                 options.output_dir)
 
diff --git a/comparem/similarity_search.py b/comparem/similarity_search.py
@@ -148,6 +148,7 @@ def _run_self_diamond(self, query_gene_file,
                                 per_identity, 
                                 per_aln_len,
                                 max_hits,
+                                sensitive,
                                 high_mem,
                                 tmp_dir,
                                 output_dir):
@@ -193,7 +194,8 @@ def _run_self_diamond(self, query_gene_file,
                             evalue, 
                             per_identity, 
                             per_aln_len, 
-                            max_hits, 
+                            max_hits,
+                            sensitive,
                             tmp_hits_table.name, 
                             'standard', 
                             tmp_dir, 
@@ -205,7 +207,8 @@ def _run_self_diamond(self, query_gene_file,
                             evalue, 
                             per_identity, 
                             per_aln_len, 
-                            max_hits, 
+                            max_hits,
+                            sensitive,
                             tmp_hits_table.name, 
                             'standard', 
                             tmp_dir)
@@ -221,6 +224,7 @@ def _run_reciprocal_diamond(self, query_gene_file,
                                         per_identity, 
                                         per_aln_len,
                                         max_hits,
+                                        sensitive,
                                         high_mem,
                                         tmp_dir,
                                         output_dir):
@@ -272,9 +276,10 @@ def _run_reciprocal_diamond(self, query_gene_file,
                             evalue, 
                             per_identity, 
                             per_aln_len, 
-                            max_hits, 
+                            max_hits,
+                            sensitive,
                             tmp_query_hits_table.name, 
-                            'tab', 
+                            'standard', 
                             tmp_dir, 
                             chunk_size=1, 
                             block_size=8)
@@ -284,9 +289,10 @@ def _run_reciprocal_diamond(self, query_gene_file,
                             evalue, 
                             per_identity, 
                             per_aln_len, 
-                            max_hits, 
+                            max_hits,
+                            sensitive,
                             tmp_query_hits_table.name, 
-                            'tab', 
+                            'standard', 
                             tmp_dir)
                 
         # get target genes hit by one or more query proteins
@@ -321,7 +327,8 @@ def _run_reciprocal_diamond(self, query_gene_file,
                             evalue, 
                             per_identity, 
                             per_aln_len, 
-                            max_hits, 
+                            max_hits,
+                            sensitive,
                             tmp_target_hits_table.name, 
                             'standard', 
                             tmp_dir, 
@@ -333,7 +340,8 @@ def _run_reciprocal_diamond(self, query_gene_file,
                             evalue, 
                             per_identity, 
                             per_aln_len, 
-                            max_hits, 
+                            max_hits,
+                            sensitive,
                             tmp_target_hits_table.name, 
                             'standard', 
                             tmp_dir)
@@ -352,6 +360,7 @@ def run(self, query_gene_files,
                     high_mem,
                     tmp_dir,
                     blastp,
+                    sensitive,
                     keep_headers,
                     output_dir):
         """Perform similarity search of query genes against target genes.
@@ -371,7 +380,9 @@ def run(self, query_gene_files,
         tmp_dir : str
             Directory to store temporary files.
         blastp : boolean
-            If True blasp-fast is used instead of DIAMOND.
+            If True, blastp-fast is used instead of DIAMOND.
+        sensitive : boolean
+            If True, the sensitive mode of DIAMOND is used.
         keep_headers : boolean
             If True, indicates FASTA headers already have the format <genome_id>~<gene_id>.
         output_dir : str
@@ -407,7 +418,9 @@ def run(self, query_gene_files,
                                         tmp_dir,
                                         output_dir)
             else:
-                self._run_reciprocal_blasp(query_gene_file, 
+                self.logger.info('NOT YET IMPLEMENTED!')
+                sys.exit()
+                self._run_reciprocal_blastp(query_gene_file, 
                                             target_gene_file, 
                                             evalue, 
                                             per_identity, 
@@ -423,6 +436,7 @@ def run(self, query_gene_files,
                                         per_identity, 
                                         per_aln_len,
                                         len(target_gene_files) * 10,
+                                        sensitive,
                                         high_mem,
                                         tmp_dir,
                                         output_dir)
@@ -435,6 +449,7 @@ def run(self, query_gene_files,
                                                 per_identity, 
                                                 per_aln_len,
                                                 len(target_gene_files) * 10,
+                                                sensitive,
                                                 high_mem,
                                                 tmp_dir,
                                                 output_dir)