From e099711e934fba8d053ce175277f5d4791396474 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Fri, 15 Mar 2024 13:35:58 +1300 Subject: [PATCH] Write compound (segmented) sequence locations Given a SeqFeqture with a CompoundLocation we now correctly write out the CDS/gene using segmented coordinates. Auspice can now handle such coordinates (see and for the corresponding schema updates). Note that the translations (via augur translate) of complex CDSs did not need modifying as they already used BioPython's SeqFeature.extract method. Supersedes #1333 --- CHANGES.md | 2 ++ augur/ancestral.py | 12 +++-------- augur/translate.py | 22 +++---------------- augur/utils.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 28 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index dbedcb5f0..549d2186d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -12,12 +12,14 @@ * filter: Updated docs with an example of tiered subsampling. [#1425][] (@victorlin) * export: Fixes bug [#1433] introduced in v23.1.0, that causes validation to fail when gene names start with `nuc`, e.g. `nucleocapsid`. [#1434][] (@corneliusroemer) * import: Fixes bug introduced in v24.2.0 that prevented `import beast` from running. [#1439][] (@tomkinsc) +* translate, ancestral: Compound CDS are now exported as segmented CDS and are now viewable in Auspice. [#1438][] (@jameshadfield) [#1425]: https://github.com/nextstrain/augur/pull/1425 [#1429]: https://github.com/nextstrain/augur/pull/1429 [#1433]: https://github.com/nextstrain/augur/issues/1433 [#1434]: https://github.com/nextstrain/augur/pull/1434 [#1436]: https://github.com/nextstrain/augur/pull/1436 +[#1438]: https://github.com/nextstrain/augur/pull/1438 [#1439]: https://github.com/nextstrain/augur/pull/1439 ## 24.2.3 (23 February 2024) diff --git a/augur/ancestral.py b/augur/ancestral.py index 2771948eb..ebf96c905 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -28,7 +28,8 @@ from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord -from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name +from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name, \ + genome_features_to_auspice_annotation from .io.file import open_file from .io.vcf import is_vcf as is_filename_vcf from treetime.vcf_utils import read_vcf, write_vcf @@ -455,14 +456,7 @@ def run(args): node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True) anc_seqs['reference'][gene] = aa_result['root_seq'] - # FIXME: Note that this is calculating the end of the CDS as 3*length of translation - # this is equivalent to the annotation for single segment CDS, but not for cds - # with splicing and slippage. But auspice can't handle the latter at the moment. - anc_seqs['annotations'][gene] = {'seqid':args.annotation, - 'type':feat.type, - 'start': int(feat.location.start)+1, - 'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]), - 'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]} + anc_seqs['annotations'].update(genome_features_to_auspice_annotation({gene: feat}, args.annotation)) # Save ancestral amino acid sequences to FASTA. if args.output_translations: diff --git a/augur/translate.py b/augur/translate.py index 45ab1c4e0..d7ce30ef6 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -17,7 +17,8 @@ import numpy as np from Bio import SeqIO, Seq, SeqRecord, Phylo from .io.vcf import write_VCF_translation, is_vcf as is_filename_vcf -from .utils import parse_genes_argument, read_node_data, load_features, write_json, get_json_name +from .utils import parse_genes_argument, read_node_data, load_features, \ + write_json, get_json_name, genome_features_to_auspice_annotation from treetime.vcf_utils import read_vcf from augur.errors import AugurError from textwrap import dedent @@ -446,24 +447,7 @@ def run(args): continue reference_translations[fname] = safe_translate(str(feat.extract(reference))) - ## glob the annotations for later auspice export - # - # Note that BioPython FeatureLocations use - # "Pythonic" coordinates: [zero-origin, half-open) - # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive] - annotations = { - 'nuc': {'start': features['nuc'].location.start+1, - 'end': features['nuc'].location.end, - 'strand': '+', - 'type': features['nuc'].type, # (unused by auspice) - 'seqid': args.reference_sequence} # (unused by auspice) - } - for fname, feat in features.items(): - annotations[fname] = {'seqid':args.reference_sequence, - 'type':feat.type, - 'start':int(feat.location.start)+1, - 'end':int(feat.location.end), - 'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]} + annotations = genome_features_to_auspice_annotation(features, args.reference_sequence, assert_nuc=True) ## determine amino acid mutations for each node try: diff --git a/augur/utils.py b/augur/utils.py index bd1a9ccd7..c65ac384a 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -832,3 +832,57 @@ def _get_genes_from_file(fname): print("Read in {} specified genes to translate.".format(len(unique_genes))) return unique_genes + + + +def genome_features_to_auspice_annotation(features, ref_seq_name=None, assert_nuc=False): + """ + Parameters + ---------- + features : dict + keys: feature names, values: Bio.SeqFeature.SeqFeature objects + ref_seq_name : str (optional) + Exported as the `seqid` for each feature. Note this is unused by Auspice + assert_nuc : bool (optional) + If true, one of the feature key names must be "nuc" + + Returns + ------- + annotations: dict + See schema-annotations.json for the schema this conforms to + + """ + from Bio.SeqFeature import SimpleLocation, CompoundLocation + + if assert_nuc and 'nuc' not in features: + raise AugurError("Genome features must include a feature for 'nuc'") + + def _parse(feat): + a = {} + # Note that BioPython locations use "Pythonic" coordinates: [zero-origin, half-open) + # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive] + if type(feat.location)==SimpleLocation: + a['start'] = int(feat.location.start)+1 + a['end'] = int(feat.location.end) + elif type(feat.location)==CompoundLocation: + a['segments'] = [ + {'start':int(segment.start)+1, 'end':int(segment.end)} + for segment in feat.location.parts # segment: SimpleLocation + ] + else: + raise AugurError(f"Encountered a genome feature with an unknown location type {type(feat.location):q}") + a['strand'] = {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand] + a['type'] = feat.type # (unused by auspice) + if ref_seq_name: + a['seqid'] = ref_seq_name # (unused by auspice) + return a + + annotations = {} + for fname, feat in features.items(): + annotations[fname] = _parse(feat) + if fname=='nuc': + assert annotations['nuc']['strand'] == '+', "Nuc feature must be +ve strand" + elif annotations[fname]['strand'] not in ['+', '-']: + print("WARNING: Feature {fname:q} uses a strand which auspice cannot display") + + return annotations