Skip to content

Commit

Permalink
Merge pull request #195 from CDCgov/dev-patch3-ick4
Browse files Browse the repository at this point in the history
write fasta straight to Genbank file instead of merging with metadata first
kyleoconnell authored Apr 9, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
2 parents 90d79ec + 9b3dc0a commit a68466d
Showing 9 changed files with 64 additions and 92 deletions.
6 changes: 3 additions & 3 deletions assets/sample_fastas/Cdiphtheriae/BX248355.fasta
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>BX248355-1-segment2 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 2/8
>BX248355 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 2/8
TTATATGAGCCCTTCCTTATCGAGTCATCTGCATCAGCGGCCTCGGTATCGGCCACCTGCGGTTCACTTG
GTGGCAGATCCATCGTTACTGGCTCGCCACGAACACTCACTACTGTAGTAAGTAGGGGGCTAGGCCACAT
AAGTGGGAAAGGGTGAGTTATATCATATGAAAATTGCCCGAAATAGGAGTAAAGAATTTTGCCTGCGATA
@@ -4072,7 +4072,7 @@ ACCTCACCTCACCTGCGAGAGGTGAGGTAGAAAACGTAACTGCGACCGATTACAGCGGGATGTTGCCGTG
CTTCTTCGCAGGAACGTTAACAACCTTGCGGTCCAAGAGGCGTAGCGCTTCAATGATCTGACCACGGGTT
TCGCTTGGTGGAATAACGGCGTCGACGTAACCACGCTCGGCTGCCATGTAAGGGTTGACCAAAGTTTCTT
CGTACTCGCGCTCGTATTCCTTAGCTACAGCGAGCATGTCCTTGCCTTCGGCAGCAGCGGCCTTGATTTC
>BX248355-1-segment3 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 3/8
>BX248355 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 3/8
CTTGCGGTAGATGAATCCCACGGCACCAGCGGCGCCCATGACAGCGATCTGTGCGGTTGGCCATGCCAAG
ACGAGGTCGGCGCCCATGTCCTTCGAACCCATCACGCAGTAAGCGCCACCGTAGGACTTGCGGGTAATCA
CAGTGATCTTGCCGACGGTTGCTTCAGAGTAGGCGTAGAGGAGCTTGGCGCCACGGCGAATGATTCCGTC
@@ -4442,7 +4442,7 @@ CGATTATTGGGCATGGTGCTCGTATTGGAGCCAATGCTCGCATTACCGGCTGTGTGATTGGTGAAGGCGC
GGAAATCGGTGCACGTTGTGAGCTTCGCGACGGCATGCGTGTGTGGCCAGGAGTTGTGATTCCAACTGCT
GGTATCCGTTTTAGTTCGGATGCCTAAGTTGCAGTAATACCTAGTGCAACCTAGTCATAGGCTCCCACCA
CGGCAGGGTAGACCCCCTCGTTGTGGTGGGAGCTTTCGCATTTTATGGAGGCTGGCTGAAGCGATTTCTT
>BX248355-1-segment4 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 4/8
>BX248355 Corynebacterium diphtheriae gravis NCTC13129, complete genome; segment 4/8
AGTAGCTGCTGAGTGCTAGCACGTTTGTGTTCAAACACTTTTATGCGACACGCCGATAATGAGCCGACTA
GCGAAGTATTTTTAGAGGCACCACTATATGTAGTACCCCCACCGTATACCCCTATGGTGGTGGCAATGTT
ACTGGTGTGACTAATGAAGTTAACAACCTGCGGAAATGCAATTTTTCCGATCGGAATTACATGTTCAGGT
Binary file modified assets/sample_metadata/Cdiphtheriae_test_1.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion bin/submission.py
Original file line number Diff line number Diff line change
@@ -109,7 +109,7 @@ def args_parser():
help="Fasta file stored in submission directory",
required=True)
else:
print(f'fasta not required')
# fasta not required
file_parser.add_argument("--fasta_file",
help="Fasta file stored in submission directory",
required=False)
52 changes: 33 additions & 19 deletions bin/submission_create.py
Original file line number Diff line number Diff line change
@@ -123,8 +123,10 @@ def create_gisaid_submission(organism, database, submission_name, submission_dir
# Create submission files
gisaid_df.to_csv(os.path.join(submission_files_dir, "metadata.csv"), index=False, sep=",")
shutil.copy(os.path.join(submission_files_dir, "metadata.csv"), os.path.join(submission_files_dir, "orig_metadata.csv"))
create_fasta(organism=organism, database="GISAID", metadata=metadata, submission_files_dir=submission_files_dir, fasta_file=fasta_file)
shutil.copy(os.path.join(submission_files_dir, "sequence.fsa"), os.path.join(submission_files_dir, "orig_sequence.fsa"))
"""
create_fasta(organism=organism, database="GISAID", submission_files_dir=submission_files_dir, fasta_file=fasta_file)
"""
shutil.copy(fasta_file, os.path.join(submission_files_dir, "orig_sequence.fsa"))
print("\n"+"Creating submission files for " + database, file=sys.stdout)
print("Files are stored at: " + os.path.join(submission_files_dir), file=sys.stdout)

@@ -404,27 +406,37 @@ def create_authorset(config_dict, metadata, submission_name, submission_files_di
f.write(" }\n")
f.write("}\n")

# Create fasta file based on database
def create_fasta(organism, database, metadata, fasta_file, submission_files_dir):
    """Write a sequence.fsa file whose headers use the database-specific sample id.

    The fasta records are first merged with the metadata (which also validates
    that every fasta header has a matching metadata row), then re-emitted with
    the id column required by the target database.
    """
    # Merge metadata with the fasta headers; validates headers match metadata rows
    merged = submission_process.process_fasta_samples(metadata=metadata, fasta_file=fasta_file)
    # Column names required for this database/organism combination
    required_cols = submission_process.get_required_colnames(database=database, organism=organism)
    # The sample-name column is flagged with "#"; strip the marker characters
    id_col = [col for col in required_cols if "#" in col][0].replace("#", "").replace("*", "")
    # Build one SeqRecord per row, re-headered with the database sample id
    seq_records = [
        SeqRecord(row["fasta_sequence_orig"], id=row[id_col], description="")
        for _, row in merged.iterrows()
    ]
    with open(os.path.join(submission_files_dir, "sequence.fsa"), "w+") as out_handle:
        SeqIO.write(seq_records, out_handle, "fasta")
# get locus tag from gff file for Table2asn submission
def get_gff_locus_tag(gff_file):
    """Read the locus tag prefix from a GFF3 file for use in the table2asn command.

    Scans feature lines until the first CDS carrying a ``locus_tag`` attribute
    and returns the prefix before the first underscore (e.g. ``ABC`` from
    ``ABC_0001``). Returns None if no locus tag is found.
    """
    locus_tag = None
    with open(gff_file, 'r') as gff:
        for line in gff:
            if line.startswith('##FASTA'):
                break  # Stop reading if FASTA section starts
            if line.startswith('#'):
                continue  # Skip comment lines
            columns = line.strip().split('\t')
            # GFF3 feature lines have 9 tab-separated columns; skip blank or
            # malformed lines instead of raising IndexError
            if len(columns) < 9 or columns[2] != 'CDS':
                continue
            for attribute in columns[8].split(';'):
                # partition tolerates attributes without '=' (split would raise)
                key, _, value = attribute.partition('=')
                if key == 'locus_tag':
                    locus_tag = value.split('_')[0]
                    break  # Found locus tag, stop searching this line
            if locus_tag:
                break  # Found locus tag, stop reading the file
    return locus_tag

# Create a zip file for genbank submission
def create_genbank_files(organism, config_dict, metadata, fasta_file, submission_name, submission_files_dir):
# Create authorset file
create_authorset(config_dict=config_dict, metadata=metadata, submission_name=submission_name, submission_files_dir=submission_files_dir)
create_fasta(organism=organism, database=["GENBANK"], metadata=metadata, fasta_file=fasta_file, submission_files_dir=submission_files_dir)
"""
create_fasta(organism=organism, database=["GENBANK"], fasta_file=fasta_file, submission_files_dir=submission_files_dir)
"""
shutil.copy(fasta_file, os.path.join(submission_files_dir, "sequence.fsa"))
# Retrieve the source df
source_df = metadata.filter(regex="^gb-seq_id$|^src-|^ncbi-spuid$|^ncbi-bioproject$|^organism$|^collection_date$").copy()
source_df.columns = source_df.columns.str.replace("src-","").str.strip()
@@ -469,7 +481,9 @@ def create_genbank_table2asn(submission_dir, submission_name, submission_files_d
print("Downloading Table2asn.", file=sys.stdout)
download_table2asn(table2asn_dir=table2asn_dir)
# Command to generate table2asn submission file
command = [table2asn_dir, "-t", os.path.join(submission_files_dir, "authorset.sbt"), "-i", os.path.join(submission_files_dir, "sequence.fsa"), "-src-file", os.path.join(submission_files_dir, "source.src"), "-o", os.path.join(submission_files_dir, submission_name + ".sqn")]
command = [table2asn_dir, "-t", os.path.join(submission_files_dir, "authorset.sbt"), "-i", os.path.join(submission_files_dir, "sequence.fsa"), \
"-src-file", os.path.join(submission_files_dir, "source.src"), "-locus-tag-prefix", get_gff_locus_tag(gff_file), \
"-o", os.path.join(submission_files_dir, submission_name + ".sqn")]
if os.path.isfile(os.path.join(submission_files_dir, "comment.cmt")):
command.append("-w")
command.append( os.path.join(submission_files_dir, "comment.cmt"))
32 changes: 2 additions & 30 deletions bin/submission_process.py
Original file line number Diff line number Diff line change
@@ -248,36 +248,8 @@ def check_raw_read_files(submission_name, submission_dir, metadata):
return validated_files

# Check sample names in metadata file are listed in fasta file
def process_fasta_samples(metadata, fasta_file):
    """Parse *fasta_file* and merge its records 1:1 with *metadata* on sequence name.

    Exits with status 1 (after printing to stderr) if the fasta contains
    duplicate headers, the merge is not one-to-one, or either file has
    sequences the other lacks. Returns the merged DataFrame, which carries the
    original fasta name/sequence/description columns alongside the metadata.
    """
    fasta_dict = []
    # Convert fasta into df
    with open(fasta_file, "r") as fsa:
        for record in SeqIO.parse(fsa, "fasta"):
            fasta_dict.append({"fasta_name_orig": record.id, "fasta_sequence_orig": record.seq, "fasta_description_orig": record.description})
    fasta_df = pd.DataFrame(fasta_dict)
    # Remove rows if they contain all Nan
    fasta_df = fasta_df.dropna(how='all')
    # Check duplicates in fasta_df
    duplicated_df = fasta_df[fasta_df.duplicated(subset=["fasta_name_orig"], keep=False)]
    if not duplicated_df.empty:
        # Report only the duplicated header names (previously this dumped the
        # entire sequence column, which is unreadable and not the duplicates)
        print("Error: Sequences in fasta file must be unique at: " + fasta_file + "\nDuplicate Sequences\n" + duplicated_df["fasta_name_orig"].to_string(index=False), file=sys.stderr)
        sys.exit(1)
    # Validate duplicates don't appear on merge (validate="1:1" raises on them)
    try:
        merged_df = metadata.merge(fasta_df, how="outer", left_on="sequence_name", right_on="fasta_name_orig", validate="1:1")
    except Exception:
        # Narrowed from a bare except so SystemExit/KeyboardInterrupt propagate
        print("Error: Unable to merge fasta file to metadata file. Please validate there are not duplicate sequences in both files.", file=sys.stderr)
        sys.exit(1)
    # Check if fasta file has sequences not in metadata
    if merged_df["sequence_name"].isnull().any():
        print("Error: Sequences in fasta file do not have an associated sequence in metadata file. Please update sequences below:\n" + merged_df[merged_df["sequence_name"].isnull()]["fasta_name_orig"].to_string(), file=sys.stderr)
        sys.exit(1)
    # Check if metadata has sequences not in fasta file
    if merged_df["fasta_name_orig"].isnull().any():
        print("Error: Sequences in metadata file do not have an associated sequence in fasta file. Please update sequences below:\n" + merged_df[merged_df["fasta_name_orig"].isnull()]["sequence_name"].to_string(), file=sys.stderr)
        sys.exit(1)
    return merged_df
# Removed process_fasta_samples(metadata, fasta_file), originally referenced in submission_create.create_fasta
# Only purpose of this function was to check that fasta matched metadata, and this is done in nextflow now

# Update submission log
def update_submission_status(submission_dir, submission_name, organism, test, send_email=False):
6 changes: 3 additions & 3 deletions conf/modules.config
Original file line number Diff line number Diff line change
@@ -20,23 +20,23 @@ process {

withName: METADATA_VALIDATION {
publishDir = [
path: { "${params.output_dir}" },
path: { "${params.output_dir}/${params.validation_outputs}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: BAKTA {
publishDir = [
path: { "${params.output_dir/bakta_outputs}" },
path: { "${params.output_dir}/${params.bakta_outputs}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: VADR_POST_CLEANUP {
publishDir = [
path: { "${params.output_dir}" },
path: { "${params.output_dir}/${params.vadr_outputs}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
14 changes: 8 additions & 6 deletions conf/test_params.config
Original file line number Diff line number Diff line change
@@ -148,18 +148,20 @@ params {
download_bakta_db = false
bakta_db_path = ""
bakta_min_contig_length = 200 // due to compliant mode default, otherwise default is 1
bakta_threads = 2
bakta_gram = "?"
bakta_genus = "Genus"
bakta_species = "species"
bakta_strain = "strain"
bakta_plasmid = "unnamed"
bakta_complete = ""
bakta_compliant = ""
bakta_translation_table = 11
bakta_gram = "?"
bakta_locus = "contig"
bakta_locus_tag = "LOCUSTAG123" // set a meaningful locus tag here for compliant mode
bakta_keep_contig_headers = ""
bakta_translation_table = 11

// optional bakta params, for now need to add these to modules/nf-core/bakta/bakta/main.nf
bakta_complete = ""
bakta_compliant = ""
bakta_keep_contig_headers = ""
bakta_prodigal_df = ""
bakta_replicons = ""
bakta_proteins = ""
@@ -174,7 +176,7 @@ params {
bakta_skip_sorf = ""
bakta_skip_gap = ""
bakta_skip_ori = ""
bakta_skip_plot = ""
bakta_skip_plot = true

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
43 changes: 14 additions & 29 deletions modules/nf-core/bakta/bakta/main.nf
Original file line number Diff line number Diff line change
@@ -15,8 +15,6 @@ process BAKTA {
'https://depot.galaxyproject.org/singularity/bakta:1.9.1--pyhdfd78af_0' :
'quay.io/biocontainers/bakta:1.9.1--pyhdfd78af_0' }"

publishDir "$params.output_dir/$params.bakta_output_dir", mode: 'copy', overwrite: params.overwrite_output

input:
path db_path
tuple val(meta), path(fasta_path), path(fastq1), path(fastq2)
@@ -38,6 +36,7 @@ process BAKTA {
def skip_ori = params.bakta_skip_ori ? "--skip-ori" : ""
def compliant = params.bakta_compliant ? "--compliant" : ""
def complete = params.bakta_complete ? "--complete" : ""
def skip_plot = params.bakta_skip_plot ? "--skip-plot" : ""
def keep_contig_headers = params.bakta_keep_contig_headers ? "--keep-contig-headers" : ""

"""
@@ -53,34 +52,20 @@ process BAKTA {
--gram $params.bakta_gram \
--locus $params.bakta_locus \
--locus-tag $params.bakta_locus_tag \
$complete \
$compliant \
$keep_contig_headers \
$proteins \
$prodigal_tf \
$skip_trna \
$skip_rrna \
$skip_ncrna \
$skip_ncrna_region \
$skip_crispr \
$skip_cds \
$skip_sorf \
$skip_gap \
$skip_ori \
$fasta_path
$complete $compliant $keep_contig_headers $proteins $prodigal_tf $skip_trna $skip_rrna \
$skip_ncrna $skip_ncrna_region $skip_crispr $skip_cds $skip_sorf $skip_gap $skip_ori $skip_plot \
$fasta_path
"""

output:
path "${fasta_path.getSimpleName()}/*.fna", emit: fna
path "${fasta_path.getSimpleName()}/*.gff3", emit: gff3
path "${fasta_path.getSimpleName()}/*.faa", emit: faa
path "${fasta_path.getSimpleName()}/*.embl", emit: embl
path "${fasta_path.getSimpleName()}/*.ffn", emit: ffn
path "${fasta_path.getSimpleName()}/*.gbff", emit: gbff
path "${fasta_path.getSimpleName()}/*.json", emit: json
path "${fasta_path.getSimpleName()}/*.log", emit: log
path "${fasta_path.getSimpleName()}/*.png", emit: png
path "${fasta_path.getSimpleName()}/*.svg", emit: svg
path "${fasta_path.getSimpleName()}/*.tsv", emit: tsv
path "${fasta_path.getSimpleName()}/*.txt", emit: txt
path "${meta.id}/*.fna", emit: fna
path "${meta.id}/*.gff3", emit: gff3
path "${meta.id}/*.faa", emit: faa
path "${meta.id}/*.embl", emit: embl
path "${meta.id}/*.ffn", emit: ffn
path "${meta.id}/*.gbff", emit: gbff
path "${meta.id}/*.json", emit: json
path "${meta.id}/*.log", emit: log
path "${meta.id}/*.tsv", emit: tsv
path "${meta.id}/*.txt", emit: txt
}
1 change: 0 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
@@ -101,7 +101,6 @@ params {
// optional bakta params
bakta_complete = ""
bakta_compliant = ""
bakta_complete = ""
bakta_keep_contig_headers = ""
bakta_prodigal_tf = ""
bakta_replicons = ""

0 comments on commit a68466d

Please sign in to comment.