Skip to content

Commit

Permalink
Implementing check to verify if input file prefixes are interpreted a…
Browse files Browse the repository at this point in the history
…s PDB IDs by BLAST when creating a database with makeblastdb.
  • Loading branch information
rfm-targa committed Jul 12, 2024
1 parent 3be2401 commit d4b8676
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 3 deletions.
9 changes: 7 additions & 2 deletions CHEWBBACA/chewBBACA.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,11 @@ def msg(name=None):
blank_spaces = pv.check_blanks(args.input_files)
# Check if any input file has an unique prefix >= 50 characters
long_prefixes = pv.check_prefix_length(args.input_files)
# Check if any file prefixes are interpreted as PDB IDs
makeblastdb_path = fo.join_paths(args.blast_path, [ct.MAKEBLASTDB_ALIAS])
blastdbcmd_path = fo.join_paths(args.blast_path, [ct.BLASTDBCMD_ALIAS])
pdb_prefixes = pv.check_prefix_pdb(args.input_files, args.output_directory, makeblastdb_path, blastdbcmd_path)

print(f'Number of inputs: {total_inputs}')

# Add clustering parameters
Expand All @@ -237,13 +242,13 @@ def msg(name=None):
shutil.copy(args.ptf_path, schema_dir)
# Determine PTF checksum
ptf_hash = fo.hash_file(args.ptf_path, hashlib.blake2b())
print(f'Copied Prodigal training file to schema seed directory.')
print(f'Copied Prodigal training file to {schema_dir}')

# Write schema config file
args.minimum_length = ct.MSL_MIN
args.ptf_path = ptf_hash
schema_config = pv.write_schema_config(vars(args), __version__, schema_dir)
print(f'Wrote schema config file to {schema_config}')
print(f'Wrote schema config values to {schema_config[1]}')

# Create the file with the list of genes/loci
pv.write_gene_list(schema_dir)
Expand Down
36 changes: 35 additions & 1 deletion CHEWBBACA/utils/blast_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def make_blast_db(makeblastdb_path, input_fasta, output_path, db_type):
# v5 databases accept those IDs but replace '-' with '_', which is an issue when chewie is looking for the original IDs
makedb_cmd = [makeblastdb_path, '-in', input_fasta,
'-out', output_path, '-parse_seqids',
'-dbtype', db_type, '-blastdb_version', '4']
'-dbtype', db_type, '-blastdb_version', '5']

makedb_process = subprocess.Popen(makedb_cmd,
stdout=subprocess.PIPE,
Expand Down Expand Up @@ -217,3 +217,37 @@ def run_blastdb_aliastool(blastdb_aliastool_path, seqid_infile, seqid_outfile):
f'{blastdb_aliastool_path} returned the following error:\n{stderr}')

return [stdout, stderr]


def run_blastdbcmd(blastdbcmd_path, blast_db, output_file):
"""Run blastdbcmd to extract sequences from a BLAST database.
Parameters
----------
blastdbcmd_path : str
Path to the blastdbcmd executable.
blast_db : str
Path to the BLAST database.
output_file : str
Path to the output file that will store the sequences.
Returns
-------
stdout : bytes
BLAST stdout.
stderr : bytes or str
BLAST stderr.
"""
blastdbcmd_args = [blastdbcmd_path, '-db', blast_db, '-out', output_file, '-entry', 'all']

blastdbcmd_process = subprocess.Popen(blastdbcmd_args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)

stdout, stderr = blastdbcmd_process.communicate()

# Exit if it is not possible to extract sequences from BLAST db
if len(stderr) > 0:
sys.exit(f'Cound not extract sequences from {blast_db}.\n')

return [stdout, stderr]
14 changes: 14 additions & 0 deletions CHEWBBACA/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,14 @@
BLASTP_ALIAS = 'blastp.exe' if platform.system() == 'Windows' else shutil.which('blastp')
MAKEBLASTDB_ALIAS = 'makeblastdb.exe' if platform.system() == 'Windows' else shutil.which('makeblastdb')
BLASTDB_ALIASTOOL_ALIAS = 'blastdb_aliastool.exe' if platform.system() == 'Windows' else shutil.which('blastdb_aliastool')
BLASTDBCMD_ALIAS = 'blastdbcmd.exe' if platform.system() == 'Windows' else shutil.which('blastdbcmd')

# Protein to create dummy FASTA records used to check if sequence IDs are interpreted as PDB IDs
DUMMY_PROT = 'MKFFYRPTGLAISINDAYQKVNFSTDGSSLRVDNPTPYFITYDQIKINGKSVKNVDMVAPYSQQTYPFKGARANETVQWTVVNDYGGDQKGESILH'
DUMMY_FASTA = 'dummy.fasta'
DUMMY_BLASTDB = 'dummy_db'
DUMMY_DIR = 'dummy_dir'
DUMMY_OUTPUT = 'dummy_output.fasta'

# BLAST warnings to be ignored
# This warning is raised in BLAST>=2.10 when passing a TXT file with sequence identifiers to -seqidlist
Expand Down Expand Up @@ -629,4 +637,10 @@
'(e.g. BLAST does not accept sequence IDs longer than 50 '
'characters when creating a database).')

INPUTS_PDB_PREFIX = ('The following input files have a prefix that is '
'interpreted as a PDB ID:\n{0}\nPlease ensure that '
'the prefix is not a valid PDB ID. This is necessary '
'to avoid issues related to how these IDs are recognized '
'by chewBBACA and its dependencies.')

MISSING_INPUT_ARG = ('Path to input files does not exist. Please provide a valid path.')
31 changes: 31 additions & 0 deletions CHEWBBACA/utils/parameters_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@

try:
from utils import (constants as ct,
blast_wrapper as bw,
gene_prediction as gp,
file_operations as fo,
chewiens_requests as cr,
fasta_operations as fao,
iterables_manipulation as im)
except ModuleNotFoundError:
from CHEWBBACA.utils import (constants as ct,
blast_wrapper as bw,
gene_prediction as gp,
file_operations as fo,
chewiens_requests as cr,
Expand Down Expand Up @@ -1346,3 +1348,32 @@ def check_prefix_length(input_list):
sys.exit(ct.INPUTS_LONG_PREFIX.format(long_prefixes_msg))

return False


def check_prefix_pdb(input_list, output_directory, makeblastdb_path, blastdbcmd_path):
"""
"""
input_paths = fo.read_lines(input_list)
prefixes = get_file_prefixes(input_paths)
print(prefixes)
# Create directory to store dummy data
dummy_dir = os.path.join(output_directory, ct.DUMMY_DIR)
fo.create_directory(dummy_dir)
# Create dummy FASTA records
dummy_seqids = [f'{i}-protein1' for i in (prefixes)]
dummy_records = [fao.fasta_str_record(ct.FASTA_RECORD_TEMPLATE, [i, ct.DUMMY_PROT]) for i in dummy_seqids]
dummy_file = os.path.join(dummy_dir, ct.DUMMY_FASTA)
fo.write_lines(dummy_records, dummy_file)
# Create BLAST db
blastdb = os.path.join(dummy_dir, ct.DUMMY_BLASTDB)
bw.make_blast_db(makeblastdb_path, dummy_file, blastdb, 'prot')
# Get sequence from BLAST db
output_fasta = os.path.join(dummy_dir, ct.DUMMY_OUTPUT)
blastdbcmd_std = bw.run_blastdbcmd(blastdbcmd_path, blastdb, output_fasta)
# Check if the sequence IDs in the BLAST db match the expected IDs
records = fao.sequence_generator(output_fasta)
recids = [record.id for record in records]
modified_seqids = set(dummy_seqids) - set(recids)
if len(modified_seqids) > 0:
pdb_prefix = [prefixes[p.split('-protein')[0]][0] for p in modified_seqids]
sys.exit(ct.INPUTS_PDB_PREFIX.format('\n'.join(pdb_prefix)))

0 comments on commit d4b8676

Please sign in to comment.