Skip to content

Commit

Permalink
Merge pull request #294 from gbouras13/dev
Browse files Browse the repository at this point in the history
Version v1.5.0. Fixes #290. Fixes #293
  • Loading branch information
gbouras13 authored Sep 21, 2023
2 parents 885fef2 + 26adfc3 commit 22747b7
Show file tree
Hide file tree
Showing 28 changed files with 5,484 additions and 634 deletions.
12 changes: 12 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
History
=======

1.5.0 (2023-09-20)
------------------

* Adds support for `pyrodigal-gv` implementing `prodigal-gv` as a gene predictor ([pyrodigal-gv](https://github.com/althonos/pyrodigal-gv) and [prodigal-gv](https://github.com/apcamargo/prodigal-gv)). This can be specified with `-g prodigal-gv`.
* Thanks to @[althonos](https://github.com/althonos) and @[apcamargo](https://github.com/apcamargo) for making this possible, and to @[asierFernandezP](https://github.com/asierFernandezP) for raising this as an issue in the first place [here](https://github.com/gbouras13/pharokka/issues/290) in #290.
* Adds checks to determine if your input FASTA has duplicated contig headers from #293 [here](https://github.com/gbouras13/pharokka/issues/293). Thanks @[thauptfeld](https://github.com/thauptfeld) for raising this.
* `-g prodigal` and `-g prodigal-gv` should be much faster thanks to multithread support added by the inimitable @althonos.
* Genbank format output will be designated with PHG not VRL (following this issue https://github.com/RyanCook94/inphared/issues/22).
* The `_length_gc_cds_density.tsv` and `_cds_final_merged_output.tsv` files now contain the translation table/genetic code for each contig (usually 11 but now not always if you use `pyrodigal-gv`).
* `--skip_mash` flag added to skip finding the closest match for each contig in INPHARED using mash.
* `--skip_extra_annotations` flag added to skip running tRNA-scanSE, MinCED and Aragorn in case you only want CDS predictions and functional annotations.

1.4.1 (2023-09-04)
------------------

Expand Down
168 changes: 102 additions & 66 deletions README.md

Large diffs are not rendered by default.

131 changes: 100 additions & 31 deletions bin/input_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
import subprocess as sp
from argparse import RawTextHelpFormatter

import pyrodigal
import pyrodigal_gv
from Bio import SeqIO
from loguru import logger
from pyrodigal import __version__
from util import get_version


Expand Down Expand Up @@ -60,7 +61,7 @@ def get_input():
"-g",
"--gene_predictor",
action="store",
help='User specified gene predictor. Use "-g phanotate" or "-g prodigal". \nDefaults to phanotate (not required unless prodigal is desired).',
help='User specified gene predictor. Use "-g phanotate" or "-g prodigal" or "-g prodigal-gv" or "-g genbank". \nDefaults to phanotate (not required unless prodigal is desired).',
default="phanotate",
)
parser.add_argument(
Expand All @@ -78,7 +79,7 @@ def get_input():
parser.add_argument(
"-c",
"--coding_table",
help="translation table for prodigal. Defaults to 11. Experimental only.",
help="translation table for prodigal. Defaults to 11.",
action="store",
default="11",
)
Expand Down Expand Up @@ -138,6 +139,16 @@ def get_input():
action="store",
default="nothing",
)
parser.add_argument(
"--skip_extra_annotations",
help="Skips tRNAscan-SE 2, MinCED and Aragorn.",
action="store_true",
),
parser.add_argument(
"--skip_mash",
help="Skips running mash to find the closest match for each contig in INPHARED.",
action="store_true",
)
parser.add_argument(
"-V",
"--version",
Expand Down Expand Up @@ -197,31 +208,64 @@ def instantiate_dirs(output_dir, meta, force):


def validate_fasta(filename):
if os.path.isfile(filename) == False: # if file doesnt exist
logger.error(f"Error: Input file {filename} does not exist. Please check your input.")
if os.path.isfile(filename) == False: # if file doesnt exist
logger.error(
f"Error: Input file {filename} does not exist. Please check your input."
)
else:
with open(filename, "r") as handle:
fasta = SeqIO.parse(handle, "fasta")
logger.info("Checking Input FASTA.")
logger.info(f"Checking input {filename}.")
if any(fasta):
logger.info("FASTA checked.")
logger.info(f"Input {filename} is in FASTA format.")
else:
logger.error("Error: Input file is not in the FASTA format.")

# check for duplicate headers
logger.info(f"Checking input {filename} for duplicate FASTA headers.")
check_duplicate_headers(filename)
logger.info(f"All headers in {filename} are unique.")


def check_duplicate_headers(fasta_file):
"""
checks if there are duplicated in the FASTA header
in response to Tina's issue
https://github.com/gbouras13/pharokka/issues/293
"""
header_set = set()

# Iterate through the FASTA file and check for duplicate headers
for record in SeqIO.parse(fasta_file, "fasta"):
header = record.description
if header in header_set:
logger.error(
f"Duplicate header found: {header}"
) # errors if duplicate header found
else:
header_set.add(header)
# if it finished it will be fine


def validate_gene_predictor(gene_predictor, genbank_flag):
if gene_predictor == "phanotate":
logger.info("Phanotate will be used for gene prediction.")
elif gene_predictor == "prodigal":
logger.info("Prodigal will be used for gene prediction.")
logger.info(
"Prodigal implemented with pyrodigal will be used for gene prediction."
)
elif gene_predictor == "prodigal-gv":
logger.info(
"Prodigal-gv implemented with pyrodigal-gv will be used for gene prediction."
)
elif gene_predictor == "genbank":
if genbank_flag is False:
logger.error(
"Error: you have specified -g genbank without --genbank. Please just use --genbank."
)
else:
logger.error(
"Error: gene predictor was incorrectly specified. Please use 'phanotate' or 'prodigal'."
"Error: gene predictor was incorrectly specified. Please use 'phanotate', 'prodigal' or 'prodigal-gv'."
)


Expand Down Expand Up @@ -300,8 +344,9 @@ def validate_threads(threads):
#######


def check_dependencies():
def check_dependencies(skip_mash):
"""Checks the dependencies and versions
skip_mash flag from args, won't check mash is skip mash specified
:return:
"""
#############
Expand All @@ -314,10 +359,10 @@ def check_dependencies():
except:
logger.error("Phanotate not found. Please reinstall pharokka.")
phan_out, _ = process.communicate()
phanotate_out = phan_out.decode().strip()
phanotate_major_version = int(phanotate_out.split(".")[0])
phanotate_minor_version = int(phanotate_out.split(".")[1])
phanotate_minorest_version = phanotate_out.split(".")[2]
phanotate_version = phan_out.decode().strip()
phanotate_major_version = int(phanotate_version.split(".")[0])
phanotate_minor_version = int(phanotate_version.split(".")[1])
phanotate_minorest_version = phanotate_version.split(".")[2]

logger.info(
f"Phanotate version found is v{phanotate_major_version}.{phanotate_minor_version}.{phanotate_minorest_version}"
Expand Down Expand Up @@ -471,25 +516,26 @@ def check_dependencies():
#############
# mash
#############
try:
process = sp.Popen(["mash", "--version"], stdout=sp.PIPE, stderr=sp.STDOUT)
except:
logger.error("mash not found. Please reinstall pharokka.")
if skip_mash is False:
try:
process = sp.Popen(["mash", "--version"], stdout=sp.PIPE, stderr=sp.STDOUT)
except:
logger.error("mash not found. Please reinstall pharokka.")

mash_out, _ = process.communicate()
mash_out = mash_out.decode().strip()
mash_out, _ = process.communicate()
mash_out = mash_out.decode().strip()

mash_major_version = int(mash_out.split(".")[0])
mash_minor_version = int(mash_out.split(".")[1])
mash_major_version = int(mash_out.split(".")[0])
mash_minor_version = int(mash_out.split(".")[1])

logger.info(f"mash version found is v{mash_major_version}.{mash_minor_version}")
logger.info(f"mash version found is v{mash_major_version}.{mash_minor_version}")

if mash_major_version != 2:
logger.error("mash is the wrong version. Please re-install pharokka.")
if mash_minor_version < 2:
logger.error("mash is the wrong version. Please re-install pharokka.")
if mash_major_version != 2:
logger.error("mash is the wrong version. Please re-install pharokka.")
if mash_minor_version < 2:
logger.error("mash is the wrong version. Please re-install pharokka.")

logger.info("mash version is ok.")
logger.info("mash version is ok.")

#############
# dnaapler
Expand Down Expand Up @@ -521,14 +567,37 @@ def check_dependencies():
# pyrodigal
#######

pyrodigal_major_version = int(__version__.split(".")[0])
pyrodigal_version = pyrodigal.__version__
pyrodigal_major_version = int(pyrodigal_version.split(".")[0])

if pyrodigal_major_version < 2:
if pyrodigal_major_version < 3:
logger.error("Pyrodigal is the wrong version. Please re-install pharokka.")

logger.info(f"Pyrodigal version is v{__version__}")
logger.info(f"Pyrodigal version is v{pyrodigal_version}")
logger.info(f"Pyrodigal version is ok.")

#######
# pyrodigal gv
#######

pyrodigal_gv_version = pyrodigal_gv.__version__
pyrodigal_major_version = int(pyrodigal_gv_version.split(".")[0])

if pyrodigal_major_version < 0:
logger.error("Pyrodigal_gv is the wrong version. Please re-install pharokka.")

logger.info(f"Pyrodigal_gv version is v{pyrodigal_gv_version}")
logger.info(f"Pyrodigal_gv version is ok.")

return (
phanotate_version,
pyrodigal_version,
pyrodigal_gv_version,
trna_version,
aragorn_version,
minced_version,
)


def instantiate_split_output(out_dir, split):
"""
Expand Down
83 changes: 57 additions & 26 deletions bin/pharokka.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
run_dnaapler, run_mash_dist, run_mash_sketch,
run_minced, run_mmseqs, run_phanotate,
run_phanotate_fasta_meta, run_phanotate_txt_meta,
run_pyrodigal, run_trna_scan, run_trnascan_meta,
split_input_fasta, translate_fastas)
run_pyrodigal, run_pyrodigal_gv, run_trna_scan,
run_trnascan_meta, split_input_fasta, translate_fastas)
from util import count_contigs, get_version


Expand Down Expand Up @@ -106,7 +106,15 @@ def main():

# dependencies
logger.info("Checking dependencies.")
check_dependencies()
# output versions
(
phanotate_version,
pyrodigal_version,
pyrodigal_gv_version,
trna_version,
aragorn_version,
minced_version,
) = check_dependencies(args.skip_mash)

# instantiation/checking fasta and gene_predictor
if args.genbank is True:
Expand Down Expand Up @@ -275,24 +283,30 @@ def main():
run_phanotate(input_fasta, out_dir, logdir)
elif gene_predictor == "prodigal":
logger.info("Implementing Prodigal using Pyrodigal.")
run_pyrodigal(input_fasta, out_dir, args.meta, args.coding_table)
run_pyrodigal(
input_fasta, out_dir, args.meta, args.coding_table, int(args.threads)
)
elif gene_predictor == "genbank":
logger.info("Extracting CDS information from your genbank file.")
elif gene_predictor == "prodigal-gv":
logger.info("Implementing Prodigal-gv using Pyrodigal-gv.")
run_pyrodigal_gv(input_fasta, out_dir, int(args.threads))

# translate fastas (parse genbank)
translate_fastas(out_dir, gene_predictor, args.coding_table, args.infile)

# run trna-scan meta mode if required
if args.meta == True:
logger.info("Starting tRNA-scanSE. Applying meta mode.")
run_trnascan_meta(input_fasta, out_dir, args.threads, num_fastas)
concat_trnascan_meta(out_dir, num_fastas)
else:
logger.info("Starting tRNA-scanSE.")
run_trna_scan(input_fasta, args.threads, out_dir, logdir)
# run minced and aragorn
run_minced(input_fasta, out_dir, prefix, logdir)
run_aragorn(input_fasta, out_dir, prefix, logdir)
if args.skip_extra_annotations is False:
if args.meta == True:
logger.info("Starting tRNA-scanSE. Applying meta mode.")
run_trnascan_meta(input_fasta, out_dir, args.threads, num_fastas)
concat_trnascan_meta(out_dir, num_fastas)
else:
logger.info("Starting tRNA-scanSE.")
run_trna_scan(input_fasta, args.threads, out_dir, logdir)
# run minced and aragorn
run_minced(input_fasta, out_dir, prefix, logdir)
run_aragorn(input_fasta, out_dir, prefix, logdir)

# running mmseqs2 on the 3 databases
if mmseqs_flag is True:
Expand Down Expand Up @@ -358,23 +372,36 @@ def main():
pharok.mmseqs_flag = mmseqs_flag
pharok.hmm_flag = hmm_flag
pharok.custom_hmm_flag = custom_hmm_flag
pharok.phanotate_version = phanotate_version
pharok.pyrodigal_version = pyrodigal_version
pharok.pyrodigal_gv_version = pyrodigal_gv_version
pharok.trna_version = trna_version
pharok.aragorn_version = aragorn_version
pharok.minced_version = minced_version
pharok.skip_extra_annotations = args.skip_extra_annotations

if pharok.hmm_flag is True:
pharok.pyhmmer_results_dict = best_results_pyhmmer
if pharok.custom_hmm_flag is True:
pharok.custom_pyhmmer_results_dict = best_results_custom_pyhmmer

#####################################
# post processing
#####################################

# gets df of length and gc for each contig
pharok.get_contig_name_lengths()

# post process results
# includes vfdb and card
# adds the merged df, vfdb and card top hits dfs to the class objec
# no need to specify params as they are in the class :)
pharok.process_results()

# gets df of length and gc for each contig
pharok.get_contig_name_lengths()

# parse the aragorn output
# get flag whether there is a tmrna from aragor
pharok.parse_aragorn()
# only if not skipping annots
if args.skip_extra_annotations is False:
pharok.parse_aragorn()

# create gff and save locustag to class for table
pharok.create_gff()
Expand Down Expand Up @@ -403,7 +430,7 @@ def main():
# convert to genbank
logger.info("Converting gff to genbank.")
# not part of the class so from processes.py
convert_gff_to_gbk(input_fasta, out_dir, out_dir, prefix, args.coding_table)
convert_gff_to_gbk(input_fasta, out_dir, out_dir, prefix, pharok.prot_seq_df)

# update fasta headers and final output tsv
pharok.update_fasta_headers()
Expand All @@ -413,12 +440,16 @@ def main():
pharok.extract_terl()

# run mash
logger.info("Finding the closest match for each contig in INPHARED using mash.")
# in process.py
run_mash_sketch(input_fasta, out_dir, logdir)
run_mash_dist(out_dir, db_dir, logdir)
# part of the class
pharok.inphared_top_hits()
if args.skip_mash is False: # skips mash
logger.info("Finding the closest match for each contig in INPHARED using mash.")
# in process.py
run_mash_sketch(input_fasta, out_dir, logdir)
run_mash_dist(out_dir, db_dir, logdir)
# part of the class
pharok.inphared_top_hits()
else:
logger.info("You have chosen --skip_mash.")
logger.info("Skipping finding the closest match for each contig in INPHARED using mash.")

# delete tmp files
remove_post_processing_files(out_dir, gene_predictor, args.meta)
Expand Down
Loading

0 comments on commit 22747b7

Please sign in to comment.