Skip to content

Commit ddba55a

Browse files
committed
[export] allow multiple trees
Multiple trees ("subtrees") have been available in Auspice since late 2021¹ and have been part of the associated schema since early 2022². Despite this, there was no way to produce such datasets within Augur itself, and despite the schema changes, the associated `augur validate` command was never updated to allow them. This commit adds support for multi-tree inputs to `augur export v2` and allows the resulting datasets to pass our associated validation commands. ¹ <nextstrain/auspice#1442> ² <#851>
1 parent 14e32ed commit ddba55a

File tree

3 files changed

+100
-17
lines changed

3 files changed

+100
-17
lines changed

Diff for: augur/export_v2.py

+19-9
Original file line numberDiff line numberDiff line change
@@ -118,13 +118,13 @@ def order_nodes(node):
118118
return od
119119

120120

121-
def read_tree(fname):
122-
tree = Phylo.read(fname, 'newick')
121+
def read_trees(filenames):
122+
trees = [Phylo.read(fname, 'newick') for fname in filenames]
123123
# augur export requires unique node names (both terminal and internal) as these
124124
# are used to associate metadata/node-data with nodes. Any duplication is fatal.
125125
# The exception to this is unlabelled node names, which auspice will handle but
126126
# won't be associated with any metadata within export.
127-
node_names = [clade.name for clade in tree.root.find_clades()]
127+
node_names = [clade.name for tree in trees for clade in tree.root.find_clades()]
128128
if None in node_names:
129129
raise AugurError(f"Tree contains unnamed nodes. If these are internal nodes you may wish to run "+
130130
"`augur refine --tree <newick> --output-tree <newick>` to name them.")
@@ -133,7 +133,7 @@ def read_tree(fname):
133133
dups = [name for name, count in Counter(node_names).items() if count>1]
134134
raise AugurError(f"{len(dups)} node names occur multiple times in the tree: " +
135135
", ".join([f"{v!r}" for v in dups[0:5]]) + ("..." if len(dups)>5 else ""))
136-
return (tree, node_names)
136+
return (trees, node_names)
137137

138138

139139
def node_div(T, node_attrs):
@@ -751,7 +751,12 @@ def _recursively_set_data(node):
751751
node['branch_attrs'] = branch_attrs[node['name']]
752752
for child in node.get("children", []):
753753
_recursively_set_data(child)
754-
_recursively_set_data(data_json["tree"])
754+
755+
if isinstance(data_json["tree"], list):
756+
for subtree in data_json['tree']:
757+
_recursively_set_data(subtree)
758+
else:
759+
_recursively_set_data(data_json["tree"])
755760

756761

757762
def set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns):
@@ -840,7 +845,11 @@ def _recursively_set_data(node):
840845
for child in node.get("children", []):
841846
_recursively_set_data(child)
842847

843-
_recursively_set_data(data_json["tree"])
848+
if isinstance(data_json["tree"], list):
849+
for subtree in data_json['tree']:
850+
_recursively_set_data(subtree)
851+
else:
852+
_recursively_set_data(data_json["tree"])
844853

845854
def node_data_prop_is_normal_trait(name):
846855
# those traits / keys / attrs which are not "special" and can be exported
@@ -894,7 +903,7 @@ def register_parser(parent_subparsers):
894903
required = parser.add_argument_group(
895904
title="REQUIRED"
896905
)
897-
required.add_argument('--tree','-t', metavar="newick", required=True, help="Phylogenetic tree, usually output from `augur refine`")
906+
required.add_argument('--tree','-t', metavar="newick", nargs='+', action='extend', required=True, help="Phylogenetic tree(s), usually output from `augur refine`")
898907
required.add_argument('--output', metavar="JSON", required=True, help="Output file (typically for visualisation in auspice)")
899908

900909
config = parser.add_argument_group(
@@ -1192,7 +1201,7 @@ def run(args):
11921201
metadata_file = {}
11931202

11941203
# parse input files
1195-
(T, node_names) = read_tree(args.tree)
1204+
(trees, node_names) = read_trees(args.tree)
11961205
node_data, node_attrs, node_data_names, metadata_names, branch_attrs = \
11971206
parse_node_data_and_metadata(node_names, node_data_file, metadata_file)
11981207
config = get_config(args)
@@ -1224,7 +1233,8 @@ def run(args):
12241233
set_filters(data_json, config)
12251234

12261235
# set tree structure
1227-
data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs, node_div(T, node_attrs))
1236+
trees_json = [convert_tree_to_json_structure(tree.root, node_attrs, node_div(tree, node_attrs)) for tree in trees]
1237+
data_json["tree"] = trees_json[0] if len(trees_json)==1 else trees_json
12281238
set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns)
12291239
set_branch_attrs_on_tree(data_json, branch_attrs)
12301240

Diff for: augur/validate_export.py

+28-8
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import sys
88
from collections import defaultdict
99

10-
def ensure_no_duplicate_names(root, ValidateError):
10+
def ensure_no_duplicate_names(tree, ValidateError):
1111
"""
1212
Check that all node names are unique, which is required for auspice (v2) JSONs.
1313
"""
@@ -18,10 +18,14 @@ def recurse(node):
1818
names.add(node["name"])
1919
if "children" in node:
2020
[recurse(child) for child in node["children"]]
21-
recurse(root)
21+
if isinstance(tree, list):
22+
for subtree in tree:
23+
recurse(subtree)
24+
else:
25+
recurse(tree)
2226

2327

24-
def collectTreeAttrsV2(root, warn):
28+
def collectTreeAttrsV2(tree, warn):
2529
"""
2630
Collect all keys specified on `node["node_attrs"]` throughout the tree
2731
and the values associated with them. Note that this will only look at
@@ -47,7 +51,12 @@ def recurse(node):
4751
[recurse(child) for child in node["children"]]
4852
else:
4953
num_terminal += 1
50-
recurse(root)
54+
55+
if isinstance(tree, list):
56+
for subtree in tree:
57+
recurse(subtree)
58+
else:
59+
recurse(tree)
5160

5261
for data in seen.values():
5362
if data["count"] == num_nodes:
@@ -56,7 +65,7 @@ def recurse(node):
5665
return(seen, num_terminal)
5766

5867

59-
def collectMutationGenes(root):
68+
def collectMutationGenes(tree):
6069
"""
6170
Returns a set of all genes specified in the tree in the "aa_muts" objects
6271
"""
@@ -67,17 +76,28 @@ def recurse(node):
6776
genes.update(mutations.keys())
6877
if "children" in node:
6978
[recurse(child) for child in node["children"]]
70-
recurse(root)
79+
80+
if isinstance(tree, list):
81+
for subtree in tree:
82+
recurse(subtree)
83+
else:
84+
recurse(tree)
85+
7186
genes -= {"nuc"}
7287
return genes
7388

74-
def collectBranchLabels(root):
89+
def collectBranchLabels(tree):
7590
labels = set()
7691
def recurse(node):
7792
labels.update(node.get("branch_attrs", {}).get("labels", {}).keys())
7893
if "children" in node:
7994
[recurse(child) for child in node["children"]]
80-
recurse(root)
95+
96+
if isinstance(tree, list):
97+
for subtree in tree:
98+
recurse(subtree)
99+
else:
100+
recurse(tree)
81101
return labels
82102

83103
def verifyMainJSONIsInternallyConsistent(data, ValidateError):

Diff for: tests/functional/export_v2/cram/multi-tree.t

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
2+
Setup
3+
4+
$ source "$TESTDIR"/_setup.sh
5+
6+
Create a small second tree (which has different names/labels than 'tree.nwk')
7+
$ cat > tree2.nwk <<~~
8+
> (tipG:1,(tipH:1,tipI:1)internalHI:2)SECOND_ROOT:0;
9+
> ~~
10+
11+
Minimal export -- no node data, no metadata etc etc
12+
$ ${AUGUR} export v2 \
13+
> --tree "$TESTDIR/../data/tree.nwk" tree2.nwk \
14+
> --output minimal.json &> /dev/null
15+
16+
More realistic export - with node_data for all nodes and metadata for some of them
17+
18+
$ cat > metadata.tsv <<~~
19+
> strain something
20+
> tipA foo
21+
> tipB foo
22+
> tipC foo
23+
> tipG bar
24+
> tipH bar
25+
> ~~
26+
27+
28+
$ cat > node-data.json <<~~
29+
> {
30+
> "nodes": {
31+
> "ROOT": {"mutation_length": 0},
32+
> "tipA": {"mutation_length": 1},
33+
> "internalBC": {"mutation_length": 2},
34+
> "tipB": {"mutation_length": 1},
35+
> "tipC": {"mutation_length": 1},
36+
> "internalDEF": {"mutation_length": 5},
37+
> "tipD": {"mutation_length": 3},
38+
> "tipE": {"mutation_length": 4},
39+
> "tipF": {"mutation_length": 1},
40+
> "SECOND_ROOT": {"mutation_length": 0},
41+
> "tipG": {"mutation_length": 1},
42+
> "internalHI": {"mutation_length": 2},
43+
> "tipH": {"mutation_length": 1},
44+
> "tipI": {"mutation_length": 1}
45+
> }
46+
> }
47+
> ~~
48+
49+
$ ${AUGUR} export v2 \
50+
> --tree "$TESTDIR/../data/tree.nwk" tree2.nwk \
51+
> --metadata metadata.tsv --color-by-metadata something \
52+
> --node-data node-data.json \
53+
> --output output.json &> /dev/null

0 commit comments

Comments
 (0)