Skip to content

Commit

Permalink
Switched to subprocess to call MAFFT. Bio.Application was being used …
Browse files Browse the repository at this point in the history
…but it is deprecated since Biopython 1.82.
  • Loading branch information
rfm-targa committed Jan 17, 2024
1 parent 3a5474d commit 3472b9d
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 37 deletions.
24 changes: 13 additions & 11 deletions CHEWBBACA/AlleleCallEvaluator/evaluate_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,28 +421,30 @@ def main(input_files, schema_directory, output_directory, annotations,

# Align sequences with MAFFT
print('\nDetermining the MSA for each locus...')
alignment_dir = fo.join_paths(temp_directory, ['alignment_files'])
fo.create_directory(alignment_dir)
alignmnet_inputs = im.divide_list_into_n_chunks(protein_files,
len(protein_files))
common_args = [alignment_dir]
mafft_outdir = fo.join_paths(temp_directory, ['alignment_files'])
fo.create_directory(mafft_outdir)
mafft_outfiles = [os.path.basename(file) for file in protein_files]
mafft_outfiles = [file.replace('.fasta', '_aligned.fasta') for file in mafft_outfiles]
mafft_outfiles = [fo.join_paths(mafft_outdir, [file]) for file in mafft_outfiles]

mafft_inputs = [[file, mafft_outfiles[i]]
for i, file in enumerate(protein_files)]

common_args = []
# Add common arguments to all sublists
inputs = im.multiprocessing_inputs(alignmnet_inputs,
inputs = im.multiprocessing_inputs(mafft_inputs,
common_args,
mw.call_mafft)

results = mo.map_async_parallelizer(inputs,
mo.function_helper,
cpu_cores,
show_progress=True)
mafft_files = {fo.file_basename(file, False).split('_protein_aligned')[0]: file
for file in results if file is not False}

print('\nCreating file with the full cgMLST alignment...', end='')
# Concatenate all alignment files and index with BioPython
concat_aln = fo.join_paths(alignment_dir, ['cgMLST_concat.fasta'])
fo.concatenate_files(mafft_files.values(), concat_aln)
concat_aln = fo.join_paths(mafft_outdir, ['cgMLST_concat.fasta'])
fo.concatenate_files(mafft_outfiles, concat_aln)
# Index file
concat_index = fao.index_fasta(concat_aln)
sample_alignment_files = []
Expand All @@ -463,7 +465,7 @@ def main(input_files, schema_directory, output_directory, annotations,
if no_tree is False:
print('Computing the NJ tree based on the core genome MSA...', end='')
# Compute NJ tree with FastTree
out_tree = fo.join_paths(alignment_dir, ['cgMLST.tree'])
out_tree = fo.join_paths(mafft_outdir, ['cgMLST.tree'])
fw.call_fasttree(full_alignment, out_tree)
phylo_data = fo.read_file(out_tree)
phylo_data = {"phylo_data": phylo_data}
Expand Down
13 changes: 9 additions & 4 deletions CHEWBBACA/SchemaEvaluator/evaluate_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,15 +496,20 @@ def locus_report(locus_file, locus_data, annotation_columns,
msa_data = {"sequences": []}
if light is False:
if locus_data[13][2] > 1 and locus_data[13][5] > 1:
output_directory = os.path.dirname(locus_data[18])
alignment_file = mw.call_mafft(locus_data[18], output_directory)
mafft_infile = locus_data[18]
output_directory = os.path.dirname(mafft_infile)
output_directory = os.path.abspath(output_directory)
mafft_outfile = os.path.basename(mafft_infile)
mafft_outfile = mafft_outfile.replace('.fasta', '_aligned.fasta')
mafft_outfile = fo.join_paths(output_directory, [mafft_outfile])
mafft_outfile_exists = mw.call_mafft(mafft_infile, mafft_outfile)
# Get MSA data
alignment_text = fo.read_file(alignment_file)
alignment_text = fo.read_file(mafft_outfile)
msa_data['sequences'] = alignment_text

# Get Tree data
# Get the phylocanvas data
tree_file = alignment_file.replace('_aligned.fasta', '.fasta.tree')
tree_file = mafft_outfile.replace('_aligned.fasta', '.fasta.tree')
phylo_data = fo.read_file(tree_file)

# Start by substituting greatest value to avoid substituting
Expand Down
42 changes: 20 additions & 22 deletions CHEWBBACA/utils/mafft_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,37 +13,35 @@


import os
from Bio.Align.Applications import MafftCommandline
import shutil
import subprocess

try:
from utils import iterables_manipulation as im
except ModuleNotFoundError:
from CHEWBBACA.utils import iterables_manipulation as im

def call_mafft(genefile, output_directory):

def call_mafft(input_file, output_file):
"""Call MAFFT to generate an alignment.
Parameters
----------
genefile : str
input_file : str
Path to a FASTA file with the sequences to align.
Returns
-------
Path to the file with the computed MSA if successful, False otherwise.
"""
try:
mafft_cline = MafftCommandline(input=genefile,
adjustdirection=True,
treeout=True,
thread=1,
retree=1,
maxiterate=0,
)
stdout, stderr = mafft_cline()
outfile_basename = os.path.basename(genefile)
outfile_basename = outfile_basename.replace('.fasta', '_aligned.fasta')
outfile = os.path.join(output_directory, outfile_basename)
with open(outfile, 'w') as handle:
handle.write(stdout)

return outfile
except Exception as e:
print(e)
return False
mafft_cmd = [shutil.which('mafft'), '--thread', '1', '--treeout', '--retree', '1',
'--maxiterate', '0', input_file, '>', output_file]
mafft_cmd = ' '.join(mafft_cmd)

mafft_cmd = subprocess.Popen(mafft_cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
mafft_cmd.wait()

return os.path.exists(output_file)

0 comments on commit 3472b9d

Please sign in to comment.