Implementing check to verify if input file prefixes are interpreted a…

…s PDB IDs by BLAST when creating a database with makeblastdb.
B-UMMI · Jul 12, 2024 · d4b8676 · d4b8676
1 parent 3be2401
commit d4b8676
Show file tree

Hide file tree

Showing 4 changed files with 87 additions and 3 deletions.
diff --git a/CHEWBBACA/chewBBACA.py b/CHEWBBACA/chewBBACA.py
@@ -217,6 +217,11 @@ def msg(name=None):
 	blank_spaces = pv.check_blanks(args.input_files)
 	# Check if any input file has an unique prefix >= 50 characters
 	long_prefixes = pv.check_prefix_length(args.input_files)
+	# Check if any file prefixes are interpreted as PDB IDs
+	makeblastdb_path = fo.join_paths(args.blast_path, [ct.MAKEBLASTDB_ALIAS])
+	blastdbcmd_path = fo.join_paths(args.blast_path, [ct.BLASTDBCMD_ALIAS])
+	pdb_prefixes = pv.check_prefix_pdb(args.input_files, args.output_directory, makeblastdb_path, blastdbcmd_path)
+
 	print(f'Number of inputs: {total_inputs}')
 
 	# Add clustering parameters
@@ -237,13 +242,13 @@ def msg(name=None):
 		shutil.copy(args.ptf_path, schema_dir)
 		# Determine PTF checksum
 		ptf_hash = fo.hash_file(args.ptf_path, hashlib.blake2b())
-		print(f'Copied Prodigal training file to schema seed directory.')
+		print(f'Copied Prodigal training file to {schema_dir}')
 
 	# Write schema config file
 	args.minimum_length = ct.MSL_MIN
 	args.ptf_path = ptf_hash
 	schema_config = pv.write_schema_config(vars(args), __version__, schema_dir)
-	print(f'Wrote schema config file to {schema_config}')
+	print(f'Wrote schema config values to {schema_config[1]}')
 
 	# Create the file with the list of genes/loci
 	pv.write_gene_list(schema_dir)

diff --git a/CHEWBBACA/utils/blast_wrapper.py b/CHEWBBACA/utils/blast_wrapper.py
@@ -52,7 +52,7 @@ def make_blast_db(makeblastdb_path, input_fasta, output_path, db_type):
 	# v5 databases accept those IDs but replace '-' with '_', which is an issue when chewie is looking for the original IDs
 	makedb_cmd = [makeblastdb_path, '-in', input_fasta,
 				  '-out', output_path, '-parse_seqids',
-				  '-dbtype', db_type, '-blastdb_version', '4']
+				  '-dbtype', db_type, '-blastdb_version', '5']
 
 	makedb_process = subprocess.Popen(makedb_cmd,
 									  stdout=subprocess.PIPE,
@@ -217,3 +217,37 @@ def run_blastdb_aliastool(blastdb_aliastool_path, seqid_infile, seqid_outfile):
 				 f'{blastdb_aliastool_path} returned the following error:\n{stderr}')
 
 	return [stdout, stderr]
+
+
+def run_blastdbcmd(blastdbcmd_path, blast_db, output_file):
+	"""Run blastdbcmd to extract sequences from a BLAST database.
+
+	Parameters
+	----------
+	blastdbcmd_path : str
+		Path to the blastdbcmd executable.
+	blast_db : str
+		Path to the BLAST database.
+	output_file : str
+		Path to the output file that will store the sequences.
+
+	Returns
+	-------
+	stdout : bytes
+		BLAST stdout.
+	stderr : bytes or str
+		BLAST stderr.
+	"""
+	blastdbcmd_args = [blastdbcmd_path, '-db', blast_db, '-out', output_file, '-entry', 'all']
+
+	blastdbcmd_process = subprocess.Popen(blastdbcmd_args,
+										  stdout=subprocess.PIPE,
+										  stderr=subprocess.PIPE)
+
+	stdout, stderr = blastdbcmd_process.communicate()
+
+	# Exit if it is not possible to extract sequences from BLAST db
+	if len(stderr) > 0:
+		sys.exit(f'Cound not extract sequences from {blast_db}.\n')
+
+	return [stdout, stderr]
diff --git a/CHEWBBACA/utils/constants.py b/CHEWBBACA/utils/constants.py
@@ -146,6 +146,14 @@
 BLASTP_ALIAS = 'blastp.exe' if platform.system() == 'Windows' else shutil.which('blastp')
 MAKEBLASTDB_ALIAS = 'makeblastdb.exe' if platform.system() == 'Windows' else shutil.which('makeblastdb')
 BLASTDB_ALIASTOOL_ALIAS = 'blastdb_aliastool.exe' if platform.system() == 'Windows' else shutil.which('blastdb_aliastool')
+BLASTDBCMD_ALIAS = 'blastdbcmd.exe' if platform.system() == 'Windows' else shutil.which('blastdbcmd')
+
+# Protein to create dummy FASTA records used to check if sequence IDs are interpreted as PDB IDs
+DUMMY_PROT = 'MKFFYRPTGLAISINDAYQKVNFSTDGSSLRVDNPTPYFITYDQIKINGKSVKNVDMVAPYSQQTYPFKGARANETVQWTVVNDYGGDQKGESILH'
+DUMMY_FASTA = 'dummy.fasta'
+DUMMY_BLASTDB = 'dummy_db'
+DUMMY_DIR = 'dummy_dir'
+DUMMY_OUTPUT = 'dummy_output.fasta'
 
 # BLAST warnings to be ignored
 # This warning is raised in BLAST>=2.10 when passing a TXT file with sequence identifiers to -seqidlist
@@ -629,4 +637,10 @@
 					  '(e.g. BLAST does not accept sequence IDs longer than 50 '
 					  'characters when creating a database).')
 
+INPUTS_PDB_PREFIX = ('The following input files have a prefix that is '
+					 'interpreted as a PDB ID:\n{0}\nPlease ensure that '
+					 'the prefix is not a valid PDB ID. This is necessary '
+					 'to avoid issues related to how these IDs are recognized '
+					 'by chewBBACA and its dependencies.')
+
 MISSING_INPUT_ARG = ('Path to input files does not exist. Please provide a valid path.')
diff --git a/CHEWBBACA/utils/parameters_validation.py b/CHEWBBACA/utils/parameters_validation.py
@@ -24,13 +24,15 @@
 
 try:
 	from utils import (constants as ct,
+					   blast_wrapper as bw,
 					   gene_prediction as gp,
 					   file_operations as fo,
 					   chewiens_requests as cr,
 					   fasta_operations as fao,
 					   iterables_manipulation as im)
 except ModuleNotFoundError:
 	from CHEWBBACA.utils import (constants as ct,
+								 blast_wrapper as bw,
 								 gene_prediction as gp,
 								 file_operations as fo,
 								 chewiens_requests as cr,
@@ -1346,3 +1348,32 @@ def check_prefix_length(input_list):
 		sys.exit(ct.INPUTS_LONG_PREFIX.format(long_prefixes_msg))
 
 	return False
+
+
+def check_prefix_pdb(input_list, output_directory, makeblastdb_path, blastdbcmd_path):
+	"""
+	"""
+	input_paths = fo.read_lines(input_list)
+	prefixes = get_file_prefixes(input_paths)
+	print(prefixes)
+	# Create directory to store dummy data
+	dummy_dir = os.path.join(output_directory, ct.DUMMY_DIR)
+	fo.create_directory(dummy_dir)
+	# Create dummy FASTA records
+	dummy_seqids = [f'{i}-protein1' for i in (prefixes)]
+	dummy_records = [fao.fasta_str_record(ct.FASTA_RECORD_TEMPLATE, [i, ct.DUMMY_PROT]) for i in dummy_seqids]
+	dummy_file = os.path.join(dummy_dir, ct.DUMMY_FASTA)
+	fo.write_lines(dummy_records, dummy_file)
+	# Create BLAST db
+	blastdb = os.path.join(dummy_dir, ct.DUMMY_BLASTDB)
+	bw.make_blast_db(makeblastdb_path, dummy_file, blastdb, 'prot')
+	# Get sequence from BLAST db
+	output_fasta = os.path.join(dummy_dir, ct.DUMMY_OUTPUT)
+	blastdbcmd_std = bw.run_blastdbcmd(blastdbcmd_path, blastdb, output_fasta)
+	# Check if the sequence IDs in the BLAST db match the expected IDs
+	records = fao.sequence_generator(output_fasta)
+	recids = [record.id for record in records]
+	modified_seqids = set(dummy_seqids) - set(recids)
+	if len(modified_seqids) > 0:
+		pdb_prefix = [prefixes[p.split('-protein')[0]][0] for p in modified_seqids]
+		sys.exit(ct.INPUTS_PDB_PREFIX.format('\n'.join(pdb_prefix)))