Switched to subprocess to call MAFFT. Bio.Application was being used …

…but it is deprecated since Biopython 1.82.
B-UMMI · Jan 17, 2024 · 3472b9d · 3472b9d
1 parent 3a5474d
commit 3472b9d
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 37 deletions.
diff --git a/CHEWBBACA/AlleleCallEvaluator/evaluate_calls.py b/CHEWBBACA/AlleleCallEvaluator/evaluate_calls.py
@@ -421,28 +421,30 @@ def main(input_files, schema_directory, output_directory, annotations,
 
             # Align sequences with MAFFT
             print('\nDetermining the MSA for each locus...')
-            alignment_dir = fo.join_paths(temp_directory, ['alignment_files'])
-            fo.create_directory(alignment_dir)
-            alignmnet_inputs = im.divide_list_into_n_chunks(protein_files,
-                                                            len(protein_files))
-            common_args = [alignment_dir]
+            mafft_outdir = fo.join_paths(temp_directory, ['alignment_files'])
+            fo.create_directory(mafft_outdir)
+            mafft_outfiles = [os.path.basename(file) for file in protein_files]
+            mafft_outfiles = [file.replace('.fasta', '_aligned.fasta') for file in mafft_outfiles]
+            mafft_outfiles = [fo.join_paths(mafft_outdir, [file]) for file in mafft_outfiles]
 
+            mafft_inputs = [[file, mafft_outfiles[i]]
+                             for i, file in enumerate(protein_files)]
+
+            common_args = []
             # Add common arguments to all sublists
-            inputs = im.multiprocessing_inputs(alignmnet_inputs,
+            inputs = im.multiprocessing_inputs(mafft_inputs,
                                                common_args,
                                                mw.call_mafft)
 
             results = mo.map_async_parallelizer(inputs,
                                                 mo.function_helper,
                                                 cpu_cores,
                                                 show_progress=True)
-            mafft_files = {fo.file_basename(file, False).split('_protein_aligned')[0]: file
-                           for file in results if file is not False}
 
             print('\nCreating file with the full cgMLST alignment...', end='')
             # Concatenate all alignment files and index with BioPython
-            concat_aln = fo.join_paths(alignment_dir, ['cgMLST_concat.fasta'])
-            fo.concatenate_files(mafft_files.values(), concat_aln)
+            concat_aln = fo.join_paths(mafft_outdir, ['cgMLST_concat.fasta'])
+            fo.concatenate_files(mafft_outfiles, concat_aln)
             # Index file
             concat_index = fao.index_fasta(concat_aln)
             sample_alignment_files = []
@@ -463,7 +465,7 @@ def main(input_files, schema_directory, output_directory, annotations,
             if no_tree is False:
                 print('Computing the NJ tree based on the core genome MSA...', end='')
                 # Compute NJ tree with FastTree
-                out_tree = fo.join_paths(alignment_dir, ['cgMLST.tree'])
+                out_tree = fo.join_paths(mafft_outdir, ['cgMLST.tree'])
                 fw.call_fasttree(full_alignment, out_tree)
                 phylo_data = fo.read_file(out_tree)
                 phylo_data = {"phylo_data": phylo_data}

diff --git a/CHEWBBACA/SchemaEvaluator/evaluate_schema.py b/CHEWBBACA/SchemaEvaluator/evaluate_schema.py
@@ -496,15 +496,20 @@ def locus_report(locus_file, locus_data, annotation_columns,
     msa_data = {"sequences": []}
     if light is False:
         if locus_data[13][2] > 1 and locus_data[13][5] > 1:
-            output_directory = os.path.dirname(locus_data[18])
-            alignment_file = mw.call_mafft(locus_data[18], output_directory)
+            mafft_infile = locus_data[18]
+            output_directory = os.path.dirname(mafft_infile)
+            output_directory = os.path.abspath(output_directory)
+            mafft_outfile = os.path.basename(mafft_infile)
+            mafft_outfile = mafft_outfile.replace('.fasta', '_aligned.fasta')
+            mafft_outfile = fo.join_paths(output_directory, [mafft_outfile])
+            mafft_outfile_exists = mw.call_mafft(mafft_infile, mafft_outfile)
             # Get MSA data
-            alignment_text = fo.read_file(alignment_file)
+            alignment_text = fo.read_file(mafft_outfile)
             msa_data['sequences'] = alignment_text
 
             # Get Tree data
             # Get the phylocanvas data
-            tree_file = alignment_file.replace('_aligned.fasta', '.fasta.tree')
+            tree_file = mafft_outfile.replace('_aligned.fasta', '.fasta.tree')
             phylo_data = fo.read_file(tree_file)
 
             # Start by substituting greatest value to avoid substituting

diff --git a/CHEWBBACA/utils/mafft_wrapper.py b/CHEWBBACA/utils/mafft_wrapper.py
@@ -13,37 +13,35 @@
 
 
 import os
-from Bio.Align.Applications import MafftCommandline
+import shutil
+import subprocess
 
+try:
+    from utils import iterables_manipulation as im
+except ModuleNotFoundError:
+    from CHEWBBACA.utils import iterables_manipulation as im
 
-def call_mafft(genefile, output_directory):
+
+def call_mafft(input_file, output_file):
     """Call MAFFT to generate an alignment.
 
     Parameters
     ----------
-    genefile : str
+    input_file : str
         Path to a FASTA file with the sequences to align.
 
     Returns
     -------
     Path to the file with the computed MSA if successful, False otherwise.
     """
-    try:
-        mafft_cline = MafftCommandline(input=genefile,
-                                       adjustdirection=True,
-                                       treeout=True,
-                                       thread=1,
-                                       retree=1,
-                                       maxiterate=0,
-                                       )
-        stdout, stderr = mafft_cline()
-        outfile_basename = os.path.basename(genefile)
-        outfile_basename = outfile_basename.replace('.fasta', '_aligned.fasta')
-        outfile = os.path.join(output_directory, outfile_basename)
-        with open(outfile, 'w') as handle:
-            handle.write(stdout)
-
-        return outfile
-    except Exception as e:
-        print(e)
-        return False
+    mafft_cmd = [shutil.which('mafft'), '--thread', '1', '--treeout', '--retree', '1',
+                 '--maxiterate', '0', input_file, '>', output_file]
+    mafft_cmd = ' '.join(mafft_cmd)
+
+    mafft_cmd = subprocess.Popen(mafft_cmd,
+                                 shell=True,
+                                 stdout=subprocess.PIPE,
+                                 stderr=subprocess.PIPE)
+    mafft_cmd.wait()
+
+    return os.path.exists(output_file)