Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write compound (segmented) sequence locations #1438

Merged
merged 1 commit into from
Mar 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@
* filter: Updated docs with an example of tiered subsampling. [#1425][] (@victorlin)
* export: Fixes bug [#1433] introduced in v23.1.0, that causes validation to fail when gene names start with `nuc`, e.g. `nucleocapsid`. [#1434][] (@corneliusroemer)
* import: Fixes bug introduced in v24.2.0 that prevented `import beast` from running. [#1439][] (@tomkinsc)
* translate, ancestral: Compound CDS are now exported as segmented CDS and are now viewable in Auspice. [#1438][] (@jameshadfield)

[#1425]: https://github.com/nextstrain/augur/pull/1425
[#1429]: https://github.com/nextstrain/augur/pull/1429
[#1433]: https://github.com/nextstrain/augur/issues/1433
[#1434]: https://github.com/nextstrain/augur/pull/1434
[#1436]: https://github.com/nextstrain/augur/pull/1436
[#1438]: https://github.com/nextstrain/augur/pull/1438
[#1439]: https://github.com/nextstrain/augur/pull/1439

## 24.2.3 (23 February 2024)
Expand Down
12 changes: 3 additions & 9 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name
from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name, \
genome_features_to_auspice_annotation
from .io.file import open_file
from .io.vcf import is_vcf as is_filename_vcf
from treetime.vcf_utils import read_vcf, write_vcf
Expand Down Expand Up @@ -455,14 +456,7 @@ def run(args):
node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True)

anc_seqs['reference'][gene] = aa_result['root_seq']
# FIXME: Note that this is calculating the end of the CDS as 3*length of translation
# this is equivalent to the annotation for single segment CDS, but not for cds
# with splicing and slippage. But auspice can't handle the latter at the moment.
anc_seqs['annotations'][gene] = {'seqid':args.annotation,
'type':feat.type,
'start': int(feat.location.start)+1,
'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
anc_seqs['annotations'].update(genome_features_to_auspice_annotation({gene: feat}, args.annotation))

# Save ancestral amino acid sequences to FASTA.
if args.output_translations:
Expand Down
22 changes: 3 additions & 19 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
import numpy as np
from Bio import SeqIO, Seq, SeqRecord, Phylo
from .io.vcf import write_VCF_translation, is_vcf as is_filename_vcf
from .utils import parse_genes_argument, read_node_data, load_features, write_json, get_json_name
from .utils import parse_genes_argument, read_node_data, load_features, \
write_json, get_json_name, genome_features_to_auspice_annotation
from treetime.vcf_utils import read_vcf
from augur.errors import AugurError
from textwrap import dedent
Expand Down Expand Up @@ -446,24 +447,7 @@ def run(args):
continue
reference_translations[fname] = safe_translate(str(feat.extract(reference)))

## glob the annotations for later auspice export
#
# Note that BioPython FeatureLocations use
# "Pythonic" coordinates: [zero-origin, half-open)
# Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
annotations = {
'nuc': {'start': features['nuc'].location.start+1,
'end': features['nuc'].location.end,
'strand': '+',
'type': features['nuc'].type, # (unused by auspice)
'seqid': args.reference_sequence} # (unused by auspice)
}
for fname, feat in features.items():
annotations[fname] = {'seqid':args.reference_sequence,
'type':feat.type,
'start':int(feat.location.start)+1,
'end':int(feat.location.end),
'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
annotations = genome_features_to_auspice_annotation(features, args.reference_sequence, assert_nuc=True)

## determine amino acid mutations for each node
try:
Expand Down
54 changes: 54 additions & 0 deletions augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -832,3 +832,57 @@ def _get_genes_from_file(fname):
print("Read in {} specified genes to translate.".format(len(unique_genes)))

return unique_genes



def genome_features_to_auspice_annotation(features, ref_seq_name=None, assert_nuc=False):
"""
Parameters
----------
features : dict
keys: feature names, values: Bio.SeqFeature.SeqFeature objects
ref_seq_name : str (optional)
Exported as the `seqid` for each feature. Note this is unused by Auspice
assert_nuc : bool (optional)
If true, one of the feature key names must be "nuc"

Returns
-------
annotations: dict
See schema-annotations.json for the schema this conforms to

"""
from Bio.SeqFeature import SimpleLocation, CompoundLocation

if assert_nuc and 'nuc' not in features:
raise AugurError("Genome features must include a feature for 'nuc'")

def _parse(feat):
a = {}
# Note that BioPython locations use "Pythonic" coordinates: [zero-origin, half-open)
# Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
if type(feat.location)==SimpleLocation:
a['start'] = int(feat.location.start)+1
a['end'] = int(feat.location.end)
elif type(feat.location)==CompoundLocation:
a['segments'] = [
{'start':int(segment.start)+1, 'end':int(segment.end)}
for segment in feat.location.parts # segment: SimpleLocation
]
else:
raise AugurError(f"Encountered a genome feature with an unknown location type {type(feat.location):q}")
a['strand'] = {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]
a['type'] = feat.type # (unused by auspice)
if ref_seq_name:
a['seqid'] = ref_seq_name # (unused by auspice)
return a

annotations = {}
for fname, feat in features.items():
annotations[fname] = _parse(feat)
if fname=='nuc':
assert annotations['nuc']['strand'] == '+', "Nuc feature must be +ve strand"
elif annotations[fname]['strand'] not in ['+', '-']:
print("WARNING: Feature {fname:q} uses a strand which auspice cannot display")

return annotations
Loading