From d6246ca052478446f7179e230e842a34f93e4cd4 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Wed, 16 Aug 2023 16:28:42 +1200 Subject: [PATCH 1/3] Update annotations schema to be correct The annotations schema is updated to more accurately represent how this data is used by Auspice version 2.46.1. Specifically, the strand of the 'nuc' property is not used by auspice, and start/end coordinates are required properties. Tests are updated because the tests were incorrect. Currently the annotations section of node-data JSONs are produced by `augur ancestral` and `augur translate`. Each adds a 'nuc' property, and it's required for Auspice, so I think it's appropriate to make it a required property for the schema. The changes to the schema here are only reflected when parsing node-data files, the next commit will make this available for our typical auspice JSON validation. --- augur/data/schema-annotations.json | 50 ++++++++++++++--------- tests/util_support/test_node_data_file.py | 2 +- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/augur/data/schema-annotations.json b/augur/data/schema-annotations.json index e2a09eaca..7e09b9b0e 100644 --- a/augur/data/schema-annotations.json +++ b/augur/data/schema-annotations.json @@ -1,34 +1,44 @@ { "type" : "object", "$schema": "http://json-schema.org/draft-06/schema#", - "title": "JSON object for the `annotations` key, typically produced by `augur translate`", - "description": "Coordinates etc of genes / genome", + "title": "Schema for the 'annotations' property (node-data JSON) or the 'genome_annotations' property (auspice JSON)", + "properties": { + "nuc": { + "type": "object", + "allOf": [{ "$ref": "#/$defs/startend" }], + "additionalProperties": true, + "$comment": "All other properties are unused by Auspice. Strand is always considered to be positive." + } + }, + "required": ["nuc"], "patternProperties": { "^[a-zA-Z0-9*_-]+$": { "type": "object", + "allOf": [{ "$ref": "#/$defs/startend" }], + "additionalProperties": true, "properties": { - "seqid":{ - "description": "Sequence on which the coordinates below are valid. Could be viral segment, bacterial contig, etc", - "$comment": "Unused by Auspice 2.0", - "type": "string" - }, - "type": { - "description": "Type of the feature. could be mRNA, CDS, or similar", - "$comment": "Unused by Auspice 2.0", + "strand": { + "description": "Is the gene on the positive ('+') or negative ('-') strand.", + "$comment": "Auspice assumes positive strand unless strand is '-'", "type": "string" - }, + } + } + } + }, + "$defs": { + "startend": { + "type": "object", + "required": ["start", "end"], + "properties": { "start": { - "description": "Gene start position (one-based, following GFF format)", - "type": "number" + "type": "integer", + "minimum": 1, + "description": "Start position (one-based, following GFF format)" }, "end": { - "description": "Gene end position (one-based closed, last position of feature, following GFF format)", - "type": "number" - }, - "strand": { - "description": "Positive or negative strand", - "type": "string", - "enum": ["-","+"] + "type": "integer", + "minimum": 2, + "description": "End position (one-based, following GFF format). This value _must_ be greater than the start." } } } diff --git a/tests/util_support/test_node_data_file.py b/tests/util_support/test_node_data_file.py index 89c4fbd7e..b59af366a 100644 --- a/tests/util_support/test_node_data_file.py +++ b/tests/util_support/test_node_data_file.py @@ -38,7 +38,7 @@ def test_validate_valid(self, build_node_data_file): build_node_data_file( f""" {{ - "annotations": {{ "a": {{ "start": 5 }} }}, + "annotations": {{ "nuc": {{ "start": 1, "end": 100 }} }}, "generated_by": {{ "program": "augur", "version": "{__version__}" }}, "nodes": {{ "a": 5 }} }} From 37f63b481ae4b491b940e03d5f090bf8d09c4d00 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Wed, 16 Aug 2023 18:57:18 +1200 Subject: [PATCH 2/3] [validate] allow local $refs See added comments for details --- augur/data/schema-annotations.json | 1 + augur/data/schema-export-v2.json | 39 +----------------------------- augur/validate.py | 35 ++++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/augur/data/schema-annotations.json b/augur/data/schema-annotations.json index 7e09b9b0e..18d3bf9c9 100644 --- a/augur/data/schema-annotations.json +++ b/augur/data/schema-annotations.json @@ -1,6 +1,7 @@ { "type" : "object", "$schema": "http://json-schema.org/draft-06/schema#", + "$id": "https://nextstrain.org/schemas/augur/annotations", "title": "Schema for the 'annotations' property (node-data JSON) or the 'genome_annotations' property (auspice JSON)", "properties": { "nuc": { diff --git a/augur/data/schema-export-v2.json b/augur/data/schema-export-v2.json index edf5b7408..3545062df 100644 --- a/augur/data/schema-export-v2.json +++ b/augur/data/schema-export-v2.json @@ -51,44 +51,7 @@ } }, "genome_annotations": { - "description": "Genome annotations (e.g. genes), relative to the reference genome", - "$comment": "Required for the entropy panel", - "type": "object", - "required": ["nuc"], - "additionalProperties": false, - "properties": { - "nuc": { - "type": "object", - "properties": { - "seqid":{ - "description": "Sequence on which the coordinates below are valid. Could be viral segment, bacterial contig, etc", - "$comment": "currently unused by Auspice", - "type": "string" - }, - "type": { - "description": "Type of the feature. could be mRNA, CDS, or similar", - "$comment": "currently unused by Auspice", - "type": "string" - }, - "start": { - "description": "Gene start position (one-based, following GFF format)", - "type": "number" - }, - "end": { - "description": "Gene end position (one-based closed, last position of feature, following GFF format)", - "type": "number" - }, - "strand": { - "description": "Positive or negative strand", - "type": "string", - "enum": ["-","+"] - } - } - } - }, - "patternProperties": { - "^[a-zA-Z0-9*_-]+$": {"$ref": "#/properties/meta/properties/genome_annotations/properties/nuc"} - } + "$ref": "https://nextstrain.org/schemas/augur/annotations" }, "filters": { "description": "These appear as filters in the footer of Auspice (which populates the displayed values based upon the tree)", diff --git a/augur/validate.py b/augur/validate.py index 3c42c7b8f..7d5eedf4c 100644 --- a/augur/validate.py +++ b/augur/validate.py @@ -25,7 +25,7 @@ class ValidateError(Exception): pass -def load_json_schema(path): +def load_json_schema(path, refs=None): ''' Load a JSON schema from the augur included set of schemas (located in augur/data) @@ -40,7 +40,28 @@ def load_json_schema(path): Validator.check_schema(schema) except jsonschema.exceptions.SchemaError as err: raise ValidateError(f"Schema {path} is not a valid JSON Schema ({Validator.META_SCHEMA['$schema']}). Error: {err}") - return Validator(schema) + + if refs: + # Make the validator aware of additional schemas + schema_store = {k: json.loads(resource_string(__package__, os.path.join("data", v))) for k,v in refs.items()} + resolver = jsonschema.RefResolver.from_schema(schema,store=schema_store) + schema_validator = Validator(schema, resolver=resolver) + else: + schema_validator = Validator(schema) + + # By default $ref URLs which we don't define in a schema_store are fetched + # by jsonschema. This often indicates a typo (the $ref doesn't match the key + # of the schema_store) or we forgot to add a local mapping for a new $ref. + # Either way, Augur should not be accessing the network. + def resolve_remote(url): + # The exception type is not important as jsonschema will catch & re-raise as a RefResolutionError + raise Exception(f"The schema used for validation attempted to fetch the remote URL '{url!r}'. " + + "Augur should resolve schema references to local files, please check the schema used " + + "and update the appropriate schema_store as needed." ) + schema_validator.resolver.resolve_remote = resolve_remote + + return schema_validator + def load_json(path): with open(path, 'rb') as fh: @@ -163,7 +184,15 @@ def auspice_config_v2(config_json, **kwargs): validate(config, schema, config_json) def export_v2(main_json, **kwargs): - main_schema = load_json_schema("schema-export-v2.json") + # The main_schema uses references to other schemas, and the suggested use is + # to define these refs as valid URLs. Augur itself should not access schemas + # over the wire so we provide a mapping between URLs and filepaths here. The + # filepath is specified relative to ./augur/data (where all the schemas + # live). + refs = { + 'https://nextstrain.org/schemas/augur/annotations': "schema-annotations.json" + } + main_schema = load_json_schema("schema-export-v2.json", refs) if main_json.endswith("frequencies.json") or main_json.endswith("entropy.json") or main_json.endswith("sequences.json"): raise ValidateError("This validation subfunction is for the main `augur export v2` JSON only.") From c530539889d0f1fe9193e086447462420a7a62ca Mon Sep 17 00:00:00 2001 From: james hadfield Date: Wed, 16 Aug 2023 21:46:36 +1200 Subject: [PATCH 3/3] Extend annotations schema See for the context for these additions. --- augur/data/schema-annotations.json | 57 ++++++++++++++++++++--- tests/test_validate.py | 73 +++++++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 7 deletions(-) diff --git a/augur/data/schema-annotations.json b/augur/data/schema-annotations.json index 18d3bf9c9..37a9075ac 100644 --- a/augur/data/schema-annotations.json +++ b/augur/data/schema-annotations.json @@ -7,21 +7,52 @@ "nuc": { "type": "object", "allOf": [{ "$ref": "#/$defs/startend" }], + "properties": { + "start": { + "enum": [1], + "$comment": "nuc must begin at 1" + }, + "strand": { + "type": "string", + "enum":["+"], + "description": "Strand is optional for nuc, as it should be +ve for all genomes (-ve strand genomes are reverse complemented)", + "$comment": "Auspice will not proceed if the JSON has strand='-'" + } + }, "additionalProperties": true, - "$comment": "All other properties are unused by Auspice. Strand is always considered to be positive." + "$comment": "All other properties are unused by Auspice." } }, "required": ["nuc"], "patternProperties": { - "^[a-zA-Z0-9*_-]+$": { + "^(?!nuc)[a-zA-Z0-9*_-]+$": { + "$comment": "Each object here defines a single CDS", "type": "object", - "allOf": [{ "$ref": "#/$defs/startend" }], + "oneOf": [{ "$ref": "#/$defs/startend" }, { "$ref": "#/$defs/segments" }], "additionalProperties": true, + "required": ["strand"], "properties": { + "gene": { + "type": "string", + "description": "The name of the gene the CDS is from. Optional.", + "$comment": "Shown in on-hover infobox & influences default CDS colors" + }, "strand": { - "description": "Is the gene on the positive ('+') or negative ('-') strand.", - "$comment": "Auspice assumes positive strand unless strand is '-'", - "type": "string" + "description": "Strand of the CDS", + "type": "string", + "enum": ["-", "+"] + }, + "color": { + "type": "string", + "description": "A CSS color or a color hex code. Optional." + }, + "display_name": { + "type": "string", + "$comment": "Shown in the on-hover info box" + }, + "description": { + "type": "string", + "$comment": "Shown in the on-hover info box" } } } @@ -42,6 +73,20 @@ "description": "End position (one-based, following GFF format). This value _must_ be greater than the start." } } + }, + "segments": { + "type": "object", + "required": ["segments"], + "properties": { + "segments": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "allOf": [{ "$ref": "#/$defs/startend" }] + } + } + } } } } diff --git a/tests/test_validate.py b/tests/test_validate.py index 427e3947d..ef1efe161 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -4,7 +4,10 @@ from augur.validate import ( validate_collection_config_fields, validate_collection_display_defaults, - validate_measurements_config + validate_measurements_config, + load_json_schema, + validate_json, + ValidateError ) @@ -88,3 +91,71 @@ def test_validate_measurements_config_invalid_default_collection(self, example_m } assert not validate_measurements_config(measurements) assert capsys.readouterr().err == "ERROR: The default collection key 'invalid_collection' does not match any of the collections' keys.\n" + + +@pytest.fixture +def genome_annotation_schema(): + return load_json_schema("schema-annotations.json") + +class TestValidateGenomeAnnotations(): + def test_negative_strand_nuc(self, capsys, genome_annotation_schema): + d = {"nuc": {"start": 1, "end": 200, "strand": "-"}} + with pytest.raises(ValidateError): + validate_json(d, genome_annotation_schema, "") + capsys.readouterr() # suppress validation error printing + + def test_nuc_not_starting_at_one(self, capsys, genome_annotation_schema): + d = {"nuc": {"start": 100, "end": 200, "strand": "+"}} + with pytest.raises(ValidateError): + validate_json(d, genome_annotation_schema, "") + capsys.readouterr() # suppress validation error printing + + def test_missing_nuc(self, capsys, genome_annotation_schema): + d = {"cds": {"start": 100, "end": 200, "strand": "+"}} + with pytest.raises(ValidateError): + validate_json(d, genome_annotation_schema, "") + capsys.readouterr() # suppress validation error printing + + def test_missing_properties(self, capsys, genome_annotation_schema): + d = {"nuc": {"start": 1, "end": 100}, "cds": {"start": 20, "strand": "+"}} + with pytest.raises(ValidateError): + validate_json(d, genome_annotation_schema, "") + capsys.readouterr() # suppress validation error printing + + def test_not_stranded_cds(self, capsys, genome_annotation_schema): + # Strand . is for features that are not stranded (as per GFF spec), and thus they're not CDSs + d = {"nuc": {"start": 1, "end": 100}, "cds": {"start": 18, "end": 20, "strand": "."}} + with pytest.raises(ValidateError): + validate_json(d, genome_annotation_schema, "") + capsys.readouterr() # suppress validation error printing + + def test_negative_coordinates(self, capsys, genome_annotation_schema): + d = {"nuc": {"start": 1, "end": 100}, "cds": {"start": -2, "end": 10, "strand": "+"}} + with pytest.raises(ValidateError): + validate_json(d, genome_annotation_schema, "") + capsys.readouterr() # suppress validation error printing + + def test_valid_genome(self, capsys, genome_annotation_schema): + d = {"nuc": {"start": 1, "end": 100}, "cds": {"start": 20, "end": 28, "strand": "+"}} + validate_json(d, genome_annotation_schema, "") + capsys.readouterr() # suppress validation error printing + + def test_valid_segmented_genome(self, capsys, genome_annotation_schema): + d = {"nuc": {"start": 1, "end": 100}, + "cds": {"segments": [{"start": 20, "end": 28}], "strand": "+"}} + validate_json(d, genome_annotation_schema, "") + capsys.readouterr() # suppress validation error printing + + def test_invalid_segmented_genome(self, capsys, genome_annotation_schema): + d = {"nuc": {"start": 1, "end": 100}, + "cds": {"segments": [{"start": 20, "end": 28}, {"start": 27}], "strand": "+"}} + with pytest.raises(ValidateError): + validate_json(d, genome_annotation_schema, "") + capsys.readouterr() # suppress validation error printing + + def test_string_coordinates(self, capsys, genome_annotation_schema): + d = {"nuc": {"start": 1, "end": 100}, + "cds": {"segments": [{"start": 20, "end": 28}, {"start": "27", "end": "29"}], "strand": "+"}} + with pytest.raises(ValidateError): + validate_json(d, genome_annotation_schema, "") + capsys.readouterr() # suppress validation error printing \ No newline at end of file