From 62d763fac733bfb7fcd58bb8fb238e1adaba63ec Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 14 Nov 2023 13:46:05 +0100 Subject: [PATCH] feat: initial implementation of ACMG rules for DEL/loss (#1) (#9) --- .gitattributes | 4 + .github/workflows/rust.yml | 15 + Cargo.toml | 22 +- src/common/mod.rs | 41 + src/main.rs | 4 +- src/strucvars/data/clingen_dosage/io.rs | 292 +++++ src/strucvars/data/clingen_dosage/mod.rs | 391 ++++++ ..._test__load_clingen_genes_file_grch37.snap | 40 + ...est__load_clingen_regions_file_grch37.snap | 40 + ...est__load_clingen_regions_file_grch38.snap | 40 + src/strucvars/data/decipher_hi.rs | 128 ++ src/strucvars/data/gnomad/io.rs | 254 ++++ src/strucvars/data/gnomad/mod.rs | 155 +++ ...rs__data__gnomad__io__test__load_file.snap | 130 ++ ...ucvars__data__gnomad__test__data_load.snap | 8 + ...ucvars__data__gnomad__test__load_file.snap | 20 + src/strucvars/data/hgnc.rs | 180 +++ src/strucvars/data/intervals.rs | 102 ++ src/strucvars/data/mod.rs | 7 + ...s__data__decipher_hi__test__data_load.snap | 9 + ...cvars__data__hgnc__test__gene_id_data.snap | 9 + ...trucvars__data__hgnc__test__load_file.snap | 25 + src/strucvars/ds.rs | 63 + src/strucvars/eval/common.rs | 16 + src/strucvars/eval/del/mod.rs | 106 ++ src/strucvars/eval/del/result.rs | 520 ++++++++ src/strucvars/eval/del/section1.rs | 138 +++ src/strucvars/eval/del/section2.rs | 1061 +++++++++++++++++ src/strucvars/eval/del/section3.rs | 132 ++ src/strucvars/eval/del/section4.rs | 67 ++ ...__test__clingen_overlaps@gene-abcd1-2.snap | 6 + ...n2__test__clingen_overlaps@gene-abcd1.snap | 19 + ...test__clingen_overlaps@region-match-2.snap | 12 + ...__test__clingen_overlaps@region-match.snap | 6 + ...l__section2__test__evaluate@APOB-2C-1.snap | 15 + ...l__section2__test__evaluate@APOB-2C-2.snap | 15 + ...l__section2__test__evaluate@APOB-2D-1.snap | 14 + ...l__section2__test__evaluate@APOB-2D-3.snap | 15 + ...l__section2__test__evaluate@APOB-2D-4.snap | 18 + ...del__section2__test__evaluate@APOB-2E.snap | 11 + ..._section2__test__evaluate@AVCRL2-2D-2.snap | 20 + ...ction2__test__evaluate@COL16A1-2B-neg.snap | 6 + ...tion2__test__evaluate@COL3A1-2A-neg-1.snap | 6 + ...tion2__test__evaluate@COL3A1-2A-neg-2.snap | 6 + ...ection2__test__evaluate@COL3A1-2A-pos.snap | 17 + ..._section2__test__evaluate@COL3A1-2C-1.snap | 15 + ..._section2__test__evaluate@COL3A1-2C-2.snap | 15 + ..._section2__test__evaluate@COL3A1-2D-1.snap | 14 + ..._section2__test__evaluate@COL3A1-2D-3.snap | 15 + ..._section2__test__evaluate@COL3A1-2D-4.snap | 18 + ...l__section2__test__evaluate@COL3A1-2E.snap | 11 + ...2__test__evaluate@ISCA-46303-2A-neg-1.snap | 6 + ...2__test__evaluate@ISCA-46303-2A-neg-2.snap | 6 + ...2__test__evaluate@ISCA-46303-2A-neg-3.snap | 6 + ...on2__test__evaluate@ISCA-46303-2A-pos.snap | 17 + ...2__test__evaluate@ISCA-46311-2F-neg-1.snap | 6 + ...2__test__evaluate@ISCA-46311-2F-neg-2.snap | 6 + ...on2__test__evaluate@ISCA-46311-2F-pos.snap | 16 + ...2__test__evaluate@ISCA-46311-2G-pos-1.snap | 16 + ...2__test__evaluate@ISCA-46311-2G-pos-2.snap | 16 + ..._section2__test__evaluate@MFN2-2H-neg.snap | 6 + ...ction2__test__evaluate@REV3L-2H-neg-1.snap | 6 + ...ction2__test__evaluate@REV3L-2H-neg-2.snap | 6 + ...ction2__test__evaluate@REV3L-2H-pos-1.snap | 20 + ...ction2__test__evaluate@REV3L-2H-pos-2.snap | 20 + ...ction2__test__evaluate@REV3L-2H-pos-3.snap | 20 + ...ion2__test__get_gene_hi_predictions-2.snap | 16 + ...ction2__test__get_gene_hi_predictions.snap | 16 + ...2__test__handle_case_2a@COL3A1-2A-pos.snap | 17 + ...est__handle_case_2a@ISCA-46303-2A-pos.snap | 17 + ...tion2__test__handle_case_2c@APOB-2C-1.snap | 13 + ...tion2__test__handle_case_2c@APOB-2C-2.snap | 13 + ...on2__test__handle_case_2c@COL3A1-2C-1.snap | 13 + ...on2__test__handle_case_2c@COL3A1-2C-2.snap | 13 + ...section2__test__handle_case_2d@2D-1-2.snap | 12 + ...__section2__test__handle_case_2d@2D-1.snap | 12 + ...section2__test__handle_case_2d@2D-3-2.snap | 13 + ...__section2__test__handle_case_2d@2D-3.snap | 13 + ...section2__test__handle_case_2d@2D-4-2.snap | 16 + ...__section2__test__handle_case_2d@2D-4.snap | 16 + ...ection2__test__handle_case_2e@APOB-2E.snap | 9 + ...tion2__test__handle_case_2e@COL3A1-2E.snap | 9 + ..._handle_case_2f@ISCA-46311-2F-neg-1-2.snap | 6 + ..._handle_case_2f@ISCA-46311-2F-neg-1-3.snap | 6 + ...t__handle_case_2f@ISCA-46311-2F-neg-1.snap | 6 + ..._handle_case_2f@ISCA-46311-2F-neg-2-2.snap | 6 + ..._handle_case_2f@ISCA-46311-2F-neg-2-3.snap | 6 + ...t__handle_case_2f@ISCA-46311-2F-neg-2.snap | 6 + ...t__handle_case_2f@ISCA-46311-2F-pos-2.snap | 12 + ...t__handle_case_2f@ISCA-46311-2F-pos-3.snap | 16 + ...est__handle_case_2f@ISCA-46311-2F-pos.snap | 6 + ...on2__test__handle_case_2h@MFN2-2H-neg.snap | 6 + ...__test__handle_case_2h@REV3L-2H-neg-1.snap | 6 + ...__test__handle_case_2h@REV3L-2H-neg-2.snap | 6 + ...__test__handle_case_2h@REV3L-2H-pos-1.snap | 20 + ...__test__handle_case_2h@REV3L-2H-pos-2.snap | 20 + ...__test__handle_case_2h@REV3L-2H-pos-3.snap | 20 + ...2__test__handle_cases_2c_2e@APOB-2C-1.snap | 13 + ...2__test__handle_cases_2c_2e@APOB-2C-2.snap | 13 + ...2__test__handle_cases_2c_2e@APOB-2D-1.snap | 12 + ...2__test__handle_cases_2c_2e@APOB-2D-3.snap | 13 + ...2__test__handle_cases_2c_2e@APOB-2D-4.snap | 16 + ...on2__test__handle_cases_2c_2e@APOB-2E.snap | 9 + ..._test__handle_cases_2c_2e@AVCRL2-2D-2.snap | 18 + ..._test__handle_cases_2c_2e@COL3A1-2C-1.snap | 13 + ..._test__handle_cases_2c_2e@COL3A1-2C-2.snap | 13 + ..._test__handle_cases_2c_2e@COL3A1-2D-1.snap | 12 + ..._test__handle_cases_2c_2e@COL3A1-2D-3.snap | 13 + ..._test__handle_cases_2c_2e@COL3A1-2D-4.snap | 16 + ...2__test__handle_cases_2c_2e@COL3A1-2E.snap | 9 + ...al__del__section3__test__evaluate_l1b.snap | 10 + ...l__test__evaluate@1-11937319-11944386.snap | 12 + ...l__test__evaluate@1-12050913-12054733.snap | 29 + ...l__test__evaluate@1-12098550-12103898.snap | 12 + src/strucvars/eval/dup/mod.rs | 4 + src/strucvars/eval/mod.rs | 264 ++++ src/strucvars/mod.rs | 17 +- tests/data/README.md | 4 + tests/data/hgnc.tsv | 3 + .../ClinGen_gene_curation_list_GRCh37.tsv | 3 + .../ClinGen_region_curation_list_GRCh37.tsv | 3 + .../ClinGen_region_curation_list_GRCh38.tsv | 3 + tests/data/strucvars/README.md | 4 + tests/data/strucvars/bootstrap.sh | 17 + .../data/strucvars/decipher_hi_prediction.tsv | 3 + tests/data/strucvars/gnomad_constraints.tsv | 3 + tests/data/strucvars/hi/README.md | 48 + tests/data/strucvars/hi/bootstrap.sh | 65 + .../hi/clinvar-empty/rocksdb/000014.sst | 3 + .../hi/clinvar-empty/rocksdb/CURRENT | 3 + .../hi/clinvar-empty/rocksdb/IDENTITY | 3 + .../strucvars/hi/clinvar-empty/rocksdb/LOCK | 0 .../strucvars/hi/clinvar-empty/rocksdb/LOG | 3 + .../hi/clinvar-empty/rocksdb/MANIFEST-000005 | 3 + .../hi/clinvar-empty/rocksdb/OPTIONS-000009 | 3 + .../hi/clinvar-empty/rocksdb/OPTIONS-000011 | 3 + .../strucvars/hi/clinvar/rocksdb/000014.sst | 3 + .../strucvars/hi/clinvar/rocksdb/000016.sst | 3 + .../data/strucvars/hi/clinvar/rocksdb/CURRENT | 3 + .../strucvars/hi/clinvar/rocksdb/IDENTITY | 3 + tests/data/strucvars/hi/clinvar/rocksdb/LOCK | 0 tests/data/strucvars/hi/clinvar/rocksdb/LOG | 3 + .../hi/clinvar/rocksdb/MANIFEST-000005 | 3 + .../hi/clinvar/rocksdb/OPTIONS-000009 | 3 + .../hi/clinvar/rocksdb/OPTIONS-000011 | 3 + tests/data/strucvars/hi/tx-mane.tsv | 3 + .../data/strucvars/hi/txs_example_hi.bin.zst | 3 + .../hi/txs_example_hi.bin.zst.report | 3 + tests/data/strucvars/rec/README.md | 5 + tests/data/strucvars/ts/README.md | 1 + 150 files changed, 5561 insertions(+), 5 deletions(-) create mode 100644 .gitattributes create mode 100644 src/strucvars/data/clingen_dosage/io.rs create mode 100644 src/strucvars/data/clingen_dosage/mod.rs create mode 100644 src/strucvars/data/clingen_dosage/snapshots/scarus__strucvars__data__clingen_dosage__io__test__load_clingen_genes_file_grch37.snap create mode 100644 src/strucvars/data/clingen_dosage/snapshots/scarus__strucvars__data__clingen_dosage__io__test__load_clingen_regions_file_grch37.snap create mode 100644 src/strucvars/data/clingen_dosage/snapshots/scarus__strucvars__data__clingen_dosage__io__test__load_clingen_regions_file_grch38.snap create mode 100644 src/strucvars/data/decipher_hi.rs create mode 100644 src/strucvars/data/gnomad/io.rs create mode 100644 src/strucvars/data/gnomad/mod.rs create mode 100644 src/strucvars/data/gnomad/snapshots/scarus__strucvars__data__gnomad__io__test__load_file.snap create mode 100644 src/strucvars/data/gnomad/snapshots/scarus__strucvars__data__gnomad__test__data_load.snap create mode 100644 src/strucvars/data/gnomad/snapshots/scarus__strucvars__data__gnomad__test__load_file.snap create mode 100644 src/strucvars/data/hgnc.rs create mode 100644 src/strucvars/data/intervals.rs create mode 100644 src/strucvars/data/mod.rs create mode 100644 src/strucvars/data/snapshots/scarus__strucvars__data__decipher_hi__test__data_load.snap create mode 100644 src/strucvars/data/snapshots/scarus__strucvars__data__hgnc__test__gene_id_data.snap create mode 100644 src/strucvars/data/snapshots/scarus__strucvars__data__hgnc__test__load_file.snap create mode 100644 src/strucvars/ds.rs create mode 100644 src/strucvars/eval/common.rs create mode 100644 src/strucvars/eval/del/mod.rs create mode 100644 src/strucvars/eval/del/result.rs create mode 100644 src/strucvars/eval/del/section1.rs create mode 100644 src/strucvars/eval/del/section2.rs create mode 100644 src/strucvars/eval/del/section3.rs create mode 100644 src/strucvars/eval/del/section4.rs create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@gene-abcd1-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@gene-abcd1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@region-match-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@region-match.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2C-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2C-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2D-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2D-3.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2D-4.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2E.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@AVCRL2-2D-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL16A1-2B-neg.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2A-neg-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2A-neg-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2A-pos.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2C-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2C-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2D-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2D-3.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2D-4.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2E.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-neg-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-neg-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-neg-3.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-pos.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2F-neg-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2F-neg-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2F-pos.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2G-pos-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2G-pos-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@MFN2-2H-neg.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-neg-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-neg-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-pos-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-pos-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-pos-3.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__get_gene_hi_predictions-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__get_gene_hi_predictions.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2a@COL3A1-2A-pos.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2a@ISCA-46303-2A-pos.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@APOB-2C-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@APOB-2C-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@COL3A1-2C-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@COL3A1-2C-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-1-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-3-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-3.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-4-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-4.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2e@APOB-2E.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2e@COL3A1-2E.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-1-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-1-3.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-2-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-2-3.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-pos-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-pos-3.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-pos.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@MFN2-2H-neg.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-neg-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-neg-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-pos-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-pos-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-pos-3.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2C-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2C-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2D-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2D-3.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2D-4.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2E.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@AVCRL2-2D-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2C-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2C-2.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2D-1.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2D-3.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2D-4.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2E.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section3__test__evaluate_l1b.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__test__evaluate@1-11937319-11944386.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__test__evaluate@1-12050913-12054733.snap create mode 100644 src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__test__evaluate@1-12098550-12103898.snap create mode 100644 src/strucvars/eval/dup/mod.rs create mode 100644 src/strucvars/eval/mod.rs create mode 100644 tests/data/README.md create mode 100644 tests/data/hgnc.tsv create mode 100644 tests/data/strucvars/ClinGen_gene_curation_list_GRCh37.tsv create mode 100644 tests/data/strucvars/ClinGen_region_curation_list_GRCh37.tsv create mode 100644 tests/data/strucvars/ClinGen_region_curation_list_GRCh38.tsv create mode 100644 tests/data/strucvars/README.md create mode 100644 tests/data/strucvars/bootstrap.sh create mode 100644 tests/data/strucvars/decipher_hi_prediction.tsv create mode 100644 tests/data/strucvars/gnomad_constraints.tsv create mode 100644 tests/data/strucvars/hi/README.md create mode 100644 tests/data/strucvars/hi/bootstrap.sh create mode 100644 tests/data/strucvars/hi/clinvar-empty/rocksdb/000014.sst create mode 100644 tests/data/strucvars/hi/clinvar-empty/rocksdb/CURRENT create mode 100644 tests/data/strucvars/hi/clinvar-empty/rocksdb/IDENTITY create mode 100644 tests/data/strucvars/hi/clinvar-empty/rocksdb/LOCK create mode 100644 tests/data/strucvars/hi/clinvar-empty/rocksdb/LOG create mode 100644 tests/data/strucvars/hi/clinvar-empty/rocksdb/MANIFEST-000005 create mode 100644 tests/data/strucvars/hi/clinvar-empty/rocksdb/OPTIONS-000009 create mode 100644 tests/data/strucvars/hi/clinvar-empty/rocksdb/OPTIONS-000011 create mode 100644 tests/data/strucvars/hi/clinvar/rocksdb/000014.sst create mode 100644 tests/data/strucvars/hi/clinvar/rocksdb/000016.sst create mode 100644 tests/data/strucvars/hi/clinvar/rocksdb/CURRENT create mode 100644 tests/data/strucvars/hi/clinvar/rocksdb/IDENTITY create mode 100644 tests/data/strucvars/hi/clinvar/rocksdb/LOCK create mode 100644 tests/data/strucvars/hi/clinvar/rocksdb/LOG create mode 100644 tests/data/strucvars/hi/clinvar/rocksdb/MANIFEST-000005 create mode 100644 tests/data/strucvars/hi/clinvar/rocksdb/OPTIONS-000009 create mode 100644 tests/data/strucvars/hi/clinvar/rocksdb/OPTIONS-000011 create mode 100644 tests/data/strucvars/hi/tx-mane.tsv create mode 100644 tests/data/strucvars/hi/txs_example_hi.bin.zst create mode 100644 tests/data/strucvars/hi/txs_example_hi.bin.zst.report create mode 100644 tests/data/strucvars/rec/README.md create mode 100644 tests/data/strucvars/ts/README.md diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..2504352 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +tests/**/hgnc.tsv filter=lfs diff=lfs merge=lfs -text +tests/**/*.tsv filter=lfs diff=lfs merge=lfs -text +tests/**/*.bin* filter=lfs diff=lfs merge=lfs -text +tests/**/rocksdb/** filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e8c8a70..770f12b 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -22,6 +22,11 @@ jobs: override: true components: rustfmt + - name: Setup protoc + uses: arduino/setup-protoc@v1.1.2 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + - name: Check format run: | cargo fmt -- --check @@ -39,6 +44,11 @@ jobs: override: true components: clippy + - name: Setup protoc + uses: arduino/setup-protoc@v1.1.2 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + - name: Lint with clippy uses: actions-rs/clippy-check@v1 with: @@ -59,6 +69,11 @@ jobs: toolchain: stable override: true + - name: Setup protoc + uses: arduino/setup-protoc@v1.1.2 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + - uses: Swatinem/rust-cache@v2 - name: Run cargo-tarpaulin diff --git a/Cargo.toml b/Cargo.toml index fdf741d..562d3c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,21 +13,39 @@ name = "scarus" path = "src/main.rs" [dependencies] +annonars = "0.24.5" anyhow = "1.0" +bio = "1.4.0" +biocommons-bioutils = "0.1.4" clap-verbosity-flag = "2.0" clap = { version = "4.4", features = ["derive", "help"] } +csv = "1.3.0" +displaydoc = "0.2.4" +hgvs = "0.13.2" +itertools = "0.11.0" log = "0.4" +mehari = "0.17" +rustc-hash = "1.1" +rustix = "=0.38.21" # TODO: remove pin +serde_json = "1.0" +serde = { version = "1.0", features = ["serde_derive"] } +serde_with = { version = "3.3", features = ["indexmap_2"] } thiserror = "1.0" tokio = { version = "1.33", features = ["full"] } tracing = "0.1" tracing-subscriber = "0.3" +rocksdb = { version = "0.21.0", features = ["multi-threaded-cf"] } +prost = "0.12.1" [dev-dependencies] anyhow = "1.0" +assert-eq-float = "0.1.3" clap-verbosity-flag = {version = "2.0"} clap = { version = "4.1", features = ["derive", "env"] } env_logger = "0.10" +insta = { version = "1.34", features = ["yaml"] } pretty_assertions = "1.3" +rstest = "0.18.2" +serde_test = "1.0" temp_testdir = "0.2" -test-log = "0.2" -tracing-subscriber = {version = "0.3" } +tracing-test = "0.2" diff --git a/src/common/mod.rs b/src/common/mod.rs index 9c5ffa9..5ca7206 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -10,3 +10,44 @@ pub struct Args { #[clap(flatten)] pub verbose: Verbosity, } + +/// Assembly to be passed on the command line. +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + PartialOrd, + Ord, + clap::ValueEnum, + serde::Deserialize, + serde::Serialize, +)] +pub enum Assembly { + /// GRCh37 + Grch37, + /// GRCh38 + Grch38, +} + +impl From for biocommons_bioutils::assemblies::Assembly { + fn from(val: Assembly) -> Self { + match val { + // GRCh37p10 is the first one with chrMT. + Assembly::Grch37 => biocommons_bioutils::assemblies::Assembly::Grch37p10, + // All canonical contigs including chrMT are in GRCh38. + Assembly::Grch38 => biocommons_bioutils::assemblies::Assembly::Grch38, + } + } +} + +impl From for Assembly { + fn from(val: biocommons_bioutils::assemblies::Assembly) -> Self { + match val { + biocommons_bioutils::assemblies::Assembly::Grch37 => Assembly::Grch37, + biocommons_bioutils::assemblies::Assembly::Grch37p10 => Assembly::Grch37, + biocommons_bioutils::assemblies::Assembly::Grch38 => Assembly::Grch38, + } + } +} diff --git a/src/main.rs b/src/main.rs index 171dc9d..2ab5c10 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,9 @@ //! Main entry point for Scarus application. -#![deny(clippy::pedantic)] +// #![deny(clippy::pedantic)] #![allow(clippy::must_use_candidate)] #![allow(clippy::module_name_repetitions)] -#![warn(missing_docs)] +// #![warn(missing_docs)] use clap::{Parser, Subcommand}; diff --git a/src/strucvars/data/clingen_dosage/io.rs b/src/strucvars/data/clingen_dosage/io.rs new file mode 100644 index 0000000..a48217a --- /dev/null +++ b/src/strucvars/data/clingen_dosage/io.rs @@ -0,0 +1,292 @@ +//! I/O implementation for `ClinGen` dosage sensitivity. + +use std::{ + io::{BufRead, BufReader}, + path::Path, +}; + +use crate::common::Assembly; + +/// Helper trait to adjust the region to the chromosome. +pub trait AdjustForAssembly { + /// Adjust the region to the chromosome. + fn for_assembly(self, assembly: Assembly) -> Self; +} + +/// `ClinGen` gene dosage sensitivity TSV entry. +#[derive(Debug, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct Gene { + /// Gene symbol. + #[serde(alias = "#Gene Symbol")] + pub gene_symbol: String, + /// NCBI gene ID. + #[serde(alias = "Gene ID")] + pub ncbi_gene_id: String, + /// Genomic location. + #[serde(alias = "Genomic Location")] + pub genomic_location: String, + /// Haploinsufficiency score. + #[serde(alias = "Haploinsufficiency Score", deserialize_with = "parse_score")] + pub haploinsufficiency_score: Option, + /// Triplosensitivity score. + #[serde(alias = "Triplosensitivity Score", deserialize_with = "parse_score")] + pub triplosensitivity_score: Option, + /// Haploinsufficiency Disease ID. + #[serde(alias = "Haploinsufficiency Disease ID")] + pub haploinsufficiency_disease_id: Option, + /// Haploinsufficiency Disease ID. + #[serde(alias = "Triplosensitivity Disease ID")] + pub triplosensitivity_disease_id: Option, +} + +impl TryInto for Gene { + type Error = anyhow::Error; + + fn try_into(self) -> Result { + Ok(super::Gene { + gene_symbol: self.gene_symbol, + ncbi_gene_id: self.ncbi_gene_id, + genomic_location: self.genomic_location, + haploinsufficiency_score: super::Score::try_from(self.haploinsufficiency_score) + .map_err(|e| anyhow::anyhow!("invalid haploinsufficiency score: {}", e))?, + triplosensitivity_score: super::Score::try_from(self.triplosensitivity_score) + .map_err(|e| anyhow::anyhow!("invalid haploinsufficiency score: {}", e))?, + haploinsufficiency_disease_id: self.haploinsufficiency_disease_id, + triplosensitivity_disease_id: self.triplosensitivity_disease_id, + }) + } +} + +impl AdjustForAssembly for Gene { + fn for_assembly(self, assembly: Assembly) -> Self { + Self { + genomic_location: if assembly == Assembly::Grch37 { + self.genomic_location.replace("chr", "") + } else { + self.genomic_location + }, + ..self + } + } +} + +/// `ClinGen` region dosage sensitivity entry. +#[derive(Debug, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct Region { + /// ISCA ID + #[serde(alias = "#ISCA ID")] + pub isca_id: String, + /// ISCA Region Name + #[serde(alias = "ISCA Region Name")] + pub isca_region_name: String, + /// Genomic location. + #[serde(alias = "Genomic Location")] + pub genomic_location: String, + /// Haploinsufficiency score. + #[serde(alias = "Haploinsufficiency Score", deserialize_with = "parse_score")] + pub haploinsufficiency_score: Option, + /// Triplosensitivity score. + #[serde(alias = "Triplosensitivity Score", deserialize_with = "parse_score")] + pub triplosensitivity_score: Option, + /// Haploinsufficiency Disease ID. + #[serde(alias = "Haploinsufficiency Disease ID")] + pub haploinsufficiency_disease_id: Option, + /// Haploinsufficiency Disease ID. + #[serde(alias = "Triplosensitivity Disease ID")] + pub triplosensitivity_disease_id: Option, +} + +impl TryInto for Region { + type Error = anyhow::Error; + + fn try_into(self) -> Result { + Ok(super::Region { + isca_id: self.isca_id, + isca_region_name: self.isca_region_name, + genomic_location: self.genomic_location, + haploinsufficiency_score: super::Score::try_from(self.haploinsufficiency_score) + .map_err(|e| anyhow::anyhow!("invalid haploinsufficiency score: {}", e))?, + triplosensitivity_score: super::Score::try_from(self.triplosensitivity_score) + .map_err(|e| anyhow::anyhow!("invalid haploinsufficiency score: {}", e))?, + haploinsufficiency_disease_id: self.haploinsufficiency_disease_id, + triplosensitivity_disease_id: self.triplosensitivity_disease_id, + }) + } +} + +impl AdjustForAssembly for Region { + fn for_assembly(self, assembly: Assembly) -> Self { + Self { + genomic_location: if assembly == Assembly::Grch37 { + self.genomic_location.replace("chr", "") + } else { + self.genomic_location + }, + ..self + } + } +} + +/// Helper for parsing the scores which may have interesting values. +fn parse_score<'de, D>(d: D) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + let tmp: String = serde::Deserialize::deserialize(d)?; + if tmp.is_empty() || tmp == "Not yet evaluated" || tmp == "-1" { + Ok(None) + } else { + Ok(Some(tmp.parse().map_err(serde::de::Error::custom)?)) + } +} + +/// Load `ClinGen` regions file. +/// +/// # Arguments +/// +/// * `path` - Path to `ClinGen` regions file. +/// * `assembly` - Assembly of the regions file, used to determine whether or not to +/// strip the `chr` prefix from the chromosome name. +/// +/// # Returns +/// +/// `ClinGen` regions. +/// +/// # Errors +/// +/// If anything goes wrong, it returns a generic `anyhow::Error`. +pub fn load_file(path: P, assembly: Assembly) -> Result, anyhow::Error> +where + T: serde::de::DeserializeOwned + std::fmt::Debug + AdjustForAssembly, + P: AsRef, +{ + // Construct reader and skip initial 5 lines. + let mut reader = std::fs::File::open(path) + .map_err(|e| anyhow::anyhow!("problem opening file: {}", e)) + .map(BufReader::new)?; + + { + let mut buf = String::new(); + for _ in 0..5 { + reader.read_line(&mut buf)?; + buf.clear(); + } + } + + let mut csv_reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .flexible(true) + .from_reader(reader); + let mut result = Vec::new(); + for record in csv_reader.deserialize() { + let record: T = record.map_err(|e| anyhow::anyhow!("problem parsing record: {}", e))?; + result.push(record.for_assembly(assembly)); + } + + Ok(result) +} + +#[cfg(test)] +mod test { + use crate::common::Assembly; + + #[test] + fn load_clingen_genes_file_grch37() -> Result<(), anyhow::Error> { + let genes = super::load_file::( + "tests/data/strucvars/ClinGen_gene_curation_list_GRCh37.tsv", + Assembly::Grch37, + )?; + + assert_eq!(genes.len(), 1518); + insta::assert_yaml_snapshot!(&genes[0..5]); + + Ok(()) + } + + #[test] + fn load_clingen_regions_file_grch37() -> Result<(), anyhow::Error> { + let regions = super::load_file::( + "tests/data/strucvars/ClinGen_region_curation_list_GRCh37.tsv", + Assembly::Grch37, + )?; + + assert_eq!(regions.len(), 514); + insta::assert_yaml_snapshot!(®ions[0..5]); + + Ok(()) + } + + #[test] + fn load_clingen_regions_file_grch38() -> Result<(), anyhow::Error> { + let regions = super::load_file::( + "tests/data/strucvars/ClinGen_region_curation_list_GRCh38.tsv", + Assembly::Grch38, + )?; + + assert_eq!(regions.len(), 514); + insta::assert_yaml_snapshot!(®ions[0..5]); + + Ok(()) + } + + #[test] + fn gene_conversion() -> Result<(), anyhow::Error> { + let io_gene = super::Gene { + gene_symbol: "AASS".to_string(), + ncbi_gene_id: "53947".to_string(), + genomic_location: "chr7:121713598-121784344".to_string(), + haploinsufficiency_score: Some(30), + triplosensitivity_score: None, + haploinsufficiency_disease_id: Some("MONDO:0009388".to_string()), + triplosensitivity_disease_id: None, + }; + + let gene: super::super::Gene = io_gene.try_into()?; + + assert_eq!( + gene, + super::super::Gene { + gene_symbol: "AASS".to_string(), + ncbi_gene_id: "53947".to_string(), + genomic_location: "chr7:121713598-121784344".to_string(), + haploinsufficiency_score: super::super::Score::GeneAssociatedWithRecessivePhenotype, + triplosensitivity_score: super::super::Score::NoEvidenceAvailable, + haploinsufficiency_disease_id: Some("MONDO:0009388".to_string()), + triplosensitivity_disease_id: None, + } + ); + + Ok(()) + } + + #[test] + fn region_conversion() -> Result<(), anyhow::Error> { + let io_region = super::Region { + isca_id: "ISCA-46748".to_string(), + isca_region_name: "Xq25 region (includes STAG2)".to_string(), + genomic_location: "chrX:123034319-123236519".to_string(), + haploinsufficiency_score: Some(3), + triplosensitivity_score: Some(3), + haploinsufficiency_disease_id: Some("MONDO:0100038".to_string()), + triplosensitivity_disease_id: None, + }; + + let region: super::super::Region = io_region.try_into()?; + + assert_eq!( + region, + super::super::Region { + isca_id: "ISCA-46748".to_string(), + isca_region_name: "Xq25 region (includes STAG2)".to_string(), + genomic_location: "chrX:123034319-123236519".to_string(), + haploinsufficiency_score: super::super::Score::SufficientEvidence, + triplosensitivity_score: super::super::Score::SufficientEvidence, + haploinsufficiency_disease_id: Some("MONDO:0100038".to_string()), + triplosensitivity_disease_id: None, + } + ); + + Ok(()) + } +} diff --git a/src/strucvars/data/clingen_dosage/mod.rs b/src/strucvars/data/clingen_dosage/mod.rs new file mode 100644 index 0000000..eccd505 --- /dev/null +++ b/src/strucvars/data/clingen_dosage/mod.rs @@ -0,0 +1,391 @@ +//! Data structures and I/O code for `ClinGen` dosage sensitivity. + +use std::path::Path; + +use bio::{ + bio_types::genome::AbstractInterval as _, + data_structures::interval_tree::ArrayBackedIntervalTree, +}; +use itertools::Itertools as _; + +use crate::common::Assembly; + +use super::{ + hgnc, + intervals::{do_overlap, Interval}, +}; + +pub mod io; + +/// Haploinsufficiency scores. +#[derive( + Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Deserialize, serde::Serialize, +)] +pub enum Score { + /// Sufficient evidence for dosage pathogenicity (3) + SufficientEvidence, + /// Little evidence for dosage pathogenicity (1) + LittleEvidence, + /// Some evidence for dosage pathogenicity (2) + SomeEvidence, + /// No evidence for dosage pathogenicity (0) + NoEvidenceAvailable, + /// Gene associated with autosomal recessive phenotype (30) + GeneAssociatedWithRecessivePhenotype, + /// Dosage sensitivity unlikely (40) + DosageSensitivityUnlikely, +} + +impl TryFrom> for Score { + type Error = anyhow::Error; + + fn try_from(value: Option) -> Result { + match value { + None | Some(0) => Ok(Self::NoEvidenceAvailable), + Some(1) => Ok(Self::LittleEvidence), + Some(2) => Ok(Self::SomeEvidence), + Some(3) => Ok(Self::SufficientEvidence), + Some(30) => Ok(Self::GeneAssociatedWithRecessivePhenotype), + Some(40) => Ok(Self::DosageSensitivityUnlikely), + _ => anyhow::bail!("invalid score: {:?}", value), + } + } +} + +/// `ClinGen` dosage sensitivy gene record to be used in the app. +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct Gene { + /// Gene symbol. + pub gene_symbol: String, + /// NCBI gene ID. + pub ncbi_gene_id: String, + /// Genomic location. + pub genomic_location: String, + /// Haploinsufficiency score. + pub haploinsufficiency_score: Score, + /// Triplosensitivity score. + pub triplosensitivity_score: Score, + /// Haploinsufficiency Disease ID. + pub haploinsufficiency_disease_id: Option, + /// Haploinsufficiency Disease ID. + pub triplosensitivity_disease_id: Option, +} + +/// Helper to convert genomic location string into an interval. +fn genomic_location_to_interval( + genomic_location: &str, +) -> Result { + let mut parts = genomic_location.split(':'); + let chrom = parts.next().ok_or_else(|| { + anyhow::anyhow!( + "could not parse chromosome from genomic location: {}", + genomic_location + ) + })?; + let mut parts = parts + .next() + .ok_or_else(|| anyhow::anyhow!("could not parse region {}", genomic_location))? + .split('-'); + let begin = parts + .next() + .unwrap() + .parse::() + .map_err(|e| anyhow::anyhow!("could not parse start position from: {}", e))? + .saturating_sub(1); + let end = parts + .next() + .unwrap() + .parse::() + .map_err(|e| anyhow::anyhow!("could not parse end position from: {}", e))?; + Ok(bio::bio_types::genome::Interval::new( + chrom.to_string(), + begin..end, + )) +} + +impl TryInto for Gene { + type Error = anyhow::Error; + + fn try_into(self) -> Result { + genomic_location_to_interval(&self.genomic_location) + } +} + +/// `ClinGen` dosage sensitivy region record to be used in the app. +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct Region { + /// ISCA ID + pub isca_id: String, + /// ISCA Region Name + pub isca_region_name: String, + /// Genomic locaion. + pub genomic_location: String, + /// Haploinsufficiency score. + pub haploinsufficiency_score: Score, + /// Triplosensitivity score. + pub triplosensitivity_score: Score, + /// Haploinsufficiency Disease ID. + pub haploinsufficiency_disease_id: Option, + /// Haploinsufficiency Disease ID. + pub triplosensitivity_disease_id: Option, +} + +impl TryInto for Region { + type Error = anyhow::Error; + + fn try_into(self) -> Result { + genomic_location_to_interval(&self.genomic_location) + } +} + +/// Facade struct for querying the `ClinGen` dosage information. +#[derive(Debug, Clone)] +pub struct Data { + /// Information regarding genes. + genes: Vec, + /// Mapping from HGNC gene ID to index in `genes`. + hgnc_to_genes_idx: rustc_hash::FxHashMap, + /// Interval tree index for querying genes, one per chromosome. + genes_trees: rustc_hash::FxHashMap>, + + /// Information on regions. + regions: Vec, +} + +impl Data { + /// Construct `Info` from paths to `ClinGen` dosage files. + /// + /// # Arguments + /// + /// * `gene_path` - Path to the `ClinGen_gene_curation_list_GRCh37.tsv` file. + /// * `region_path` - Path to the `ClinGen_region_curation_list_GRCh37.tsv` file. + /// * `hgnc_data` - `hgnc::Data` to build mapping from HGNC ID to NCBI gene ID. + /// * `assembly` - Assembly that will be used (for adjusting chromosome names). + /// + /// # Returns + /// + /// `Info` struct. + /// + /// # Errors + /// + /// If anything goes wrong, it returns a generic `anyhow::Error`. + pub fn new( + gene_path: P1, + region_path: P2, + hgnc_data: &hgnc::Data, + assembly: Assembly, + ) -> Result + where + P1: AsRef, + P2: AsRef, + { + let genes: Vec = io::load_file::(gene_path, assembly) + .map_err(|e| anyhow::anyhow!("problem loading genes: {}", e))? + .into_iter() + .filter(|gene| match gene.genomic_location.as_str() { + "tbd" => { + tracing::warn!( + "skipping gene with genomic location {}: {}", + gene.genomic_location, + gene.gene_symbol + ); + false + } + _ => true, + }) + .map(std::convert::TryInto::try_into) + .collect::, _>>() + .map_err(|e| anyhow::anyhow!("problem converting genes: {}", e))?; + let genes_trees = Self::build_genes_trees(&genes) + .map_err(|e| anyhow::anyhow!("problem building interval trees for genes: {}", e))?; + let hgnc_to_genes_idx = genes + .iter() + .enumerate() + .map(|(idx, gene)| -> Result<(String, usize), anyhow::Error> { + Ok(( + hgnc_data + .by_ncbi_gene_id(&gene.ncbi_gene_id) + .map(|info| info.hgnc_id.clone()) + .ok_or_else(|| { + anyhow::anyhow!( + "could not find HGNC ID for NCBI gene ID {}", + gene.ncbi_gene_id + ) + })?, + idx, + )) + }) + .collect::, _>>()?; + + let regions = io::load_file::(region_path, assembly) + .map_err(|e| anyhow::anyhow!("problem loading regions: {}", e))? + .into_iter() + .filter(|gene| match gene.genomic_location.as_str() { + "tbd" => { + tracing::warn!( + "skipping region with genomic location {}: {}", + gene.genomic_location, + gene.isca_id + ); + false + } + _ => true, + }) + .map(std::convert::TryInto::try_into) + .collect::, _>>() + .map_err(|e| anyhow::anyhow!("problem converting regions: {}", e))?; + + Ok(Self { + genes, + genes_trees, + hgnc_to_genes_idx, + regions, + }) + } + + /// Build interval trees for the genes. + pub fn build_genes_trees( + genes: &[Gene], + ) -> Result>, anyhow::Error> + { + let mut per_contig = genes + .iter() + .enumerate() + .map(|(idx, gene)| { + let interval: Interval = gene.clone().try_into()?; + let contig = interval.contig().to_string(); + let range = interval.range(); + Ok((contig, (range, idx))) + }) + .collect::, _>>() + .map_err(|e: anyhow::Error| { + anyhow::anyhow!("problem converting genes to intervals: {}", e) + })?; + per_contig.sort_by_key(|(contig, _)| contig.clone()); + let per_contig = per_contig + .into_iter() + .group_by(|(contig, _)| contig.clone()) + .into_iter() + .map(|(contig, group)| { + ( + contig, + ArrayBackedIntervalTree::from_iter(group.map(|(_, (range, idx))| (range, idx))), + ) + }) + .collect::>(); + + Ok(rustc_hash::FxHashMap::from_iter(per_contig)) + } + + /// Obtain the gene information for the given gene HGNC ID. + /// + /// # Arguments + /// + /// * `hgnc_id` - HGNC identifier. + /// + /// # Returns + /// + /// The ClinGen gene dosage information for the given `hgnc_id`, if any. + pub fn gene_by_hgnc_id(&self, hgnc_id: &str) -> Option<&Gene> { + self.hgnc_to_genes_idx + .get(hgnc_id) + .map(|idx| &self.genes[*idx]) + } + + /// Obtain overlapping gene information for the given genomic location. + /// + /// # Arguments + /// + /// * `locus` - Genomic location. + /// + /// # Returns + /// + /// The ClinGen gene dosage information overlapping with `locus`. + pub fn gene_by_overlap(&self, locus: &bio::bio_types::genome::Interval) -> Vec { + self.genes_trees + .get(locus.contig()) + .map(|tree| { + tree.find(locus.range()) + .into_iter() + .map(|idx| &self.genes[*idx.data()]) + .cloned() + .collect::>() + }) + .unwrap_or_default() + } + + /// Obtain overlapping region information for the given genomic location. + /// + /// # Arguments + /// + /// * `locus` - Genomic location. + /// + /// # Returns + /// + /// The ClinGen region dosage information overlapping with `locus`. + pub fn region_by_overlap(&self, locus: &bio::bio_types::genome::Interval) -> Vec { + self.regions + .iter() + .filter(|region| { + let region_locus: bio::bio_types::genome::Interval = (*region) + .clone() + .try_into() + .expect("region to interval conversion must succeed"); + do_overlap(locus, ®ion_locus) + }) + .cloned() + .collect() + } +} + +#[cfg(test)] +mod test { + use crate::common::Assembly; + + #[test] + fn info_with_paths() -> Result<(), anyhow::Error> { + let info = super::Data::new( + "tests/data/strucvars/ClinGen_gene_curation_list_GRCh37.tsv", + "tests/data/strucvars/ClinGen_region_curation_list_GRCh37.tsv", + &super::hgnc::Data::new("tests/data/hgnc.tsv")?, + Assembly::Grch37, + )?; + + assert_eq!(info.genes.len(), 1518); + assert_eq!(info.regions.len(), 513); + + Ok(()) + } + + #[test] + fn score_conversion() -> Result<(), anyhow::Error> { + assert_eq!( + super::Score::try_from(Some(0))?, + super::Score::NoEvidenceAvailable + ); + assert_eq!( + super::Score::try_from(Some(1))?, + super::Score::LittleEvidence + ); + assert_eq!(super::Score::try_from(Some(2))?, super::Score::SomeEvidence); + assert_eq!( + super::Score::try_from(Some(3))?, + super::Score::SufficientEvidence + ); + assert_eq!( + super::Score::try_from(Some(30))?, + super::Score::GeneAssociatedWithRecessivePhenotype + ); + assert_eq!( + super::Score::try_from(Some(40))?, + super::Score::DosageSensitivityUnlikely + ); + assert_eq!( + super::Score::try_from(None)?, + super::Score::NoEvidenceAvailable + ); + assert!(super::Score::try_from(Some(4)).is_err()); + + Ok(()) + } +} diff --git a/src/strucvars/data/clingen_dosage/snapshots/scarus__strucvars__data__clingen_dosage__io__test__load_clingen_genes_file_grch37.snap b/src/strucvars/data/clingen_dosage/snapshots/scarus__strucvars__data__clingen_dosage__io__test__load_clingen_genes_file_grch37.snap new file mode 100644 index 0000000..e02665d --- /dev/null +++ b/src/strucvars/data/clingen_dosage/snapshots/scarus__strucvars__data__clingen_dosage__io__test__load_clingen_genes_file_grch37.snap @@ -0,0 +1,40 @@ +--- +source: src/strucvars/data/clingen_dosage/io.rs +expression: "&genes[0..5]" +--- +- gene_symbol: A4GALT + ncbi_gene_id: "53947" + genomic_location: "22:43088121-43117307" + haploinsufficiency_score: 30 + triplosensitivity_score: 0 + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: ~ +- gene_symbol: AAGAB + ncbi_gene_id: "79719" + genomic_location: "15:67493013-67547536" + haploinsufficiency_score: 3 + triplosensitivity_score: 0 + haploinsufficiency_disease_id: "MONDO:0007858" + triplosensitivity_disease_id: ~ +- gene_symbol: AARS1 + ncbi_gene_id: "16" + genomic_location: "16:70286297-70323412" + haploinsufficiency_score: 0 + triplosensitivity_score: 0 + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: ~ +- gene_symbol: AARS2 + ncbi_gene_id: "57505" + genomic_location: "6:44266463-44281063" + haploinsufficiency_score: 30 + triplosensitivity_score: ~ + haploinsufficiency_disease_id: "MONDO:0014387" + triplosensitivity_disease_id: ~ +- gene_symbol: AASS + ncbi_gene_id: "10157" + genomic_location: "7:121713598-121784344" + haploinsufficiency_score: 30 + triplosensitivity_score: ~ + haploinsufficiency_disease_id: "MONDO:0009388" + triplosensitivity_disease_id: ~ + diff --git a/src/strucvars/data/clingen_dosage/snapshots/scarus__strucvars__data__clingen_dosage__io__test__load_clingen_regions_file_grch37.snap b/src/strucvars/data/clingen_dosage/snapshots/scarus__strucvars__data__clingen_dosage__io__test__load_clingen_regions_file_grch37.snap new file mode 100644 index 0000000..698045b --- /dev/null +++ b/src/strucvars/data/clingen_dosage/snapshots/scarus__strucvars__data__clingen_dosage__io__test__load_clingen_regions_file_grch37.snap @@ -0,0 +1,40 @@ +--- +source: src/strucvars/data/clingen_dosage/io.rs +expression: "®ions[0..5]" +--- +- isca_id: ISCA-46748 + isca_region_name: 19p13.3 Critical Region 1 (CR1) + genomic_location: tbd + haploinsufficiency_score: 0 + triplosensitivity_score: 1 + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: "MONDO:0100038" +- isca_id: ISCA-46747 + isca_region_name: "22q11.2 recurrent region (distal type III, E/F-H) (includes SMARCB1)" + genomic_location: "22:23119415-24994433" + haploinsufficiency_score: 3 + triplosensitivity_score: 1 + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: ~ +- isca_id: ISCA-46743 + isca_region_name: Xq25 region (includes STAG2) + genomic_location: "X:123034319-123236519" + haploinsufficiency_score: 3 + triplosensitivity_score: 3 + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: "MONDO:0100038" +- isca_id: ISCA-46742 + isca_region_name: 7p22.1 region (includes ACTB) + genomic_location: "7:5536848-5799722" + haploinsufficiency_score: 3 + triplosensitivity_score: 1 + haploinsufficiency_disease_id: "MONDO:0000508" + triplosensitivity_disease_id: "MONDO:0100038" +- isca_id: ISCA-46741 + isca_region_name: 9p24.3 Region (includes DMRT1) + genomic_location: "9:1-1982069" + haploinsufficiency_score: 2 + triplosensitivity_score: ~ + haploinsufficiency_disease_id: "MONDO:0002145" + triplosensitivity_disease_id: ~ + diff --git a/src/strucvars/data/clingen_dosage/snapshots/scarus__strucvars__data__clingen_dosage__io__test__load_clingen_regions_file_grch38.snap b/src/strucvars/data/clingen_dosage/snapshots/scarus__strucvars__data__clingen_dosage__io__test__load_clingen_regions_file_grch38.snap new file mode 100644 index 0000000..e816ac1 --- /dev/null +++ b/src/strucvars/data/clingen_dosage/snapshots/scarus__strucvars__data__clingen_dosage__io__test__load_clingen_regions_file_grch38.snap @@ -0,0 +1,40 @@ +--- +source: src/strucvars/data/clingen_dosage/io.rs +expression: "®ions[0..5]" +--- +- isca_id: ISCA-46748 + isca_region_name: 19p13.3 Critical Region 1 (CR1) + genomic_location: tbd + haploinsufficiency_score: 0 + triplosensitivity_score: 1 + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: "MONDO:0100038" +- isca_id: ISCA-46747 + isca_region_name: "22q11.2 recurrent region (distal type III, E/F-H) (includes SMARCB1)" + genomic_location: "chr22:22776925-24598466" + haploinsufficiency_score: 3 + triplosensitivity_score: 1 + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: ~ +- isca_id: ISCA-46743 + isca_region_name: Xq25 region (includes STAG2) + genomic_location: "chrX:123900469-124102669" + haploinsufficiency_score: 3 + triplosensitivity_score: 3 + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: "MONDO:0100038" +- isca_id: ISCA-46742 + isca_region_name: 7p22.1 region (includes ACTB) + genomic_location: "chr7:5497217-5760091" + haploinsufficiency_score: 3 + triplosensitivity_score: 1 + haploinsufficiency_disease_id: "MONDO:0000508" + triplosensitivity_disease_id: "MONDO:0100038" +- isca_id: ISCA-46741 + isca_region_name: 9p24.3 Region (includes DMRT1) + genomic_location: "chr9:883300-918342" + haploinsufficiency_score: 2 + triplosensitivity_score: ~ + haploinsufficiency_disease_id: "MONDO:0002145" + triplosensitivity_disease_id: ~ + diff --git a/src/strucvars/data/decipher_hi.rs b/src/strucvars/data/decipher_hi.rs new file mode 100644 index 0000000..daf7c51 --- /dev/null +++ b/src/strucvars/data/decipher_hi.rs @@ -0,0 +1,128 @@ +//! Haploinsufficiency index from DECIPHER. + +use std::{io::BufReader, path::Path}; + +/// DECIPHER HI prediction. +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct HiPrediction { + /// HGNC identifier. + pub hgnc_id: String, + /// Official HGNC gene symbol. + pub hgnc_symbol: String, + /// P(HI) prediction from DECIPHER HI. + pub p_hi: f64, + /// Percent HI index. + pub hi_index: f64, +} + +/// Load DECIPHER HI regions file. +/// +/// # Arguments +/// +/// * `path` - Path to DECIPHER HI file. +/// +/// # Returns +/// +/// HI predicitons. +/// +/// # Errors +/// +/// If anything goes wrong, it returns a generic `anyhow::Error`. +pub fn load_file

(path: P) -> Result, anyhow::Error> +where + P: AsRef, +{ + // Construct reader and skip initial 5 lines. + let reader = std::fs::File::open(path) + .map_err(|e| anyhow::anyhow!("problem opening file: {}", e)) + .map(BufReader::new)?; + + let mut csv_reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .flexible(false) + .from_reader(reader); + let mut result = Vec::new(); + for record in csv_reader.deserialize() { + let record: HiPrediction = + record.map_err(|e| anyhow::anyhow!("problem parsing record: {}", e))?; + result.push(record); + } + + Ok(result) +} + +/// Facade struct that allows easy acccess to the DECIPHER HI data. +#[derive(Debug, Clone)] +pub struct Data { + /// The DECIPHER HI data. + data: Vec, + /// Mapping from HGNC identifier to DECIPHER HI prediction index. + hgnc_to_data_idx: rustc_hash::FxHashMap, +} + +impl Data { + /// Load from file and construct. + pub fn load

(path: P) -> Result + where + P: AsRef, + { + let data = load_file(path)?; + Ok(Self::new(data)) + } + + /// Create a new `Data` object. + /// + /// # Arguments + /// + /// * `data` - The DECIPHER HI data. + /// + /// # Returns + /// + /// A new `Data` object. + pub fn new(data: Vec) -> Self { + let hgnc_to_data_idx = data + .iter() + .enumerate() + .map(|(idx, hi_prediction)| (hi_prediction.hgnc_id.clone(), idx)) + .collect::>(); + Self { + data, + hgnc_to_data_idx, + } + } + + /// Get the DECIPHER HI prediction for the given HGNC identifier. + /// + /// # Arguments + /// + /// * `hgnc_id` - HGNC identifier. + /// + /// # Returns + /// + /// The DECIPHER HI prediction for the given HGNC identifier, if any. + pub fn by_hgnc_id(&self, hgnc_id: &str) -> Option<&HiPrediction> { + self.hgnc_to_data_idx + .get(hgnc_id) + .map(|idx| &self.data[*idx]) + } +} + +#[cfg(test)] +mod test { + #[test] + fn test_load_file() -> Result<(), anyhow::Error> { + let genes = super::load_file("tests/data/strucvars/decipher_hi_prediction.tsv")?; + assert_eq!(genes.len(), 17995); + + Ok(()) + } + + #[test] + fn data_load() -> Result<(), anyhow::Error> { + let data = super::Data::load("tests/data/strucvars/decipher_hi_prediction.tsv")?; + insta::assert_yaml_snapshot!(data.by_hgnc_id("HGNC:1100")); + + Ok(()) + } +} diff --git a/src/strucvars/data/gnomad/io.rs b/src/strucvars/data/gnomad/io.rs new file mode 100644 index 0000000..d0ef04a --- /dev/null +++ b/src/strucvars/data/gnomad/io.rs @@ -0,0 +1,254 @@ +//! I/O code for gnomAD constraints data. + +use std::{io::BufReader, path::Path}; + +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +/// Deserialize `Option::None` as `"NA"`. +/// +/// cf. https://stackoverflow.com/a/56384732/84349 +fn deserialize_option_na<'de, D, T: Deserialize<'de>>( + deserializer: D, +) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + // We define a local enum type inside of the function because it is untagged, serde will + // deserialize as the first variant that it can. + #[derive(Deserialize)] + #[serde(untagged)] + enum MaybeNA { + // If it can be parsed as Option, it will be.. + Value(Option), + // ... otherwise try parsing as a string. + NAString(String), + } + + // Deserialize into local enum. + let value: MaybeNA = Deserialize::deserialize(deserializer)?; + match value { + // If parsed as T or None, return that. + MaybeNA::Value(value) => Ok(value), + + // Otherwise, if value is string an "n/a", return None (and fail if it is any other + // string) + MaybeNA::NAString(string) => { + if string == "NA" { + Ok(None) + } else { + Err(serde::de::Error::custom("Unexpected string")) + } + } + } +} + +/// Serialize `Option::None` as `"NA"`. +fn serialize_option_na(x: &Option, s: S) -> Result +where + S: Serializer, + T: Serialize, +{ + match x { + Some(x) => s.serialize_some(x), + None => s.serialize_str("NA"), + } +} + +/// A record from the gnomAD constraints database. +#[serde_with::skip_serializing_none] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Record { + /// The Ensembl gene ID. + pub ensembl_gene_id: String, + /// The NCBI gene ID. + pub entrez_id: String, + /// The HGNC gene symbol. + pub gene_symbol: String, + /// The expected number of loss-of-function variants. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub exp_lof: Option, + /// The expected number of missense variants. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub exp_mis: Option, + /// The expected number of synonymous variants. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub exp_syn: Option, + /// The missense-related Z-score. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub mis_z: Option, + /// The observed number of loss-of-function variants. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub obs_lof: Option, + /// The observed number of missense variants. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub obs_mis: Option, + /// The observed number of synonymous variants. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub obs_syn: Option, + /// The loss-of-function observed/expected ratio. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub oe_lof: Option, + /// The lower bound of the loss-of-function observed/expected ratio. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub oe_lof_lower: Option, + /// The upper bound of the loss-of-function observed/expected ratio. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub oe_lof_upper: Option, + /// The missense observed/expected ratio. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub oe_mis: Option, + /// The lower bound of the missense observed/expected ratio. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub oe_mis_lower: Option, + /// The upper bound of the missense observed/expected ratio. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub oe_mis_upper: Option, + /// The synonymous observed/expected ratio. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub oe_syn: Option, + /// The lower bound of the synonymous observed/expected ratio. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub oe_syn_lower: Option, + /// The upper bound of the synonymous observed/expected ratio. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub oe_syn_upper: Option, + /// The probability of loss-of-function intolerance (pLI score). + #[serde( + alias = "pLI", + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub pli: Option, + /// The synonymous-related Z-score. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub syn_z: Option, + /// The probability of loss-of-function intolerance (pLI score) from ExAC. + #[serde( + alias = "exac_pLI", + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub exac_pli: Option, + /// The observed number of loss-of-function variants from ExAC. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub exac_obs_lof: Option, + /// The expected number of loss-of-function variants from ExAC. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub exac_exp_lof: Option, + /// The loss-of-function observed/expected ratio from ExAC. + #[serde( + serialize_with = "serialize_option_na", + deserialize_with = "deserialize_option_na" + )] + pub exac_oe_lof: Option, +} + +/// Load gnomAD constraints file. +/// +/// # Arguments +/// +/// * `path` - Path to gnomAD constraints file. +/// +/// # Returns +/// +/// gnomAD constraints. +/// +/// # Errors +/// +/// If anything goes wrong, it returns a generic `anyhow::Error`. +pub fn load_file

(path: P) -> Result, anyhow::Error> +where + P: AsRef, +{ + // Construct reader and skip initial 5 lines. + tracing::debug!("opening file: {:?}", path.as_ref()); + let reader = std::fs::File::open(path) + .map_err(|e| anyhow::anyhow!("problem opening file: {}", e)) + .map(BufReader::new)?; + + tracing::debug!("reading records..."); + let mut csv_reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .flexible(true) + .from_reader(reader); + let mut result = Vec::new(); + for record in csv_reader.deserialize() { + let record: Record = + record.map_err(|e| anyhow::anyhow!("problem parsing record: {}", e))?; + result.push(record); + } + tracing::debug!("read a total of {} records", result.len()); + + Ok(result) +} + +#[cfg(test)] +mod test { + #[test] + fn test_load_file() -> Result<(), anyhow::Error> { + let genes = super::load_file("tests/data/strucvars/gnomad_constraints.tsv")?; + + assert_eq!(genes.len(), 18481); + insta::assert_yaml_snapshot!(&genes[0..5]); + + Ok(()) + } +} diff --git a/src/strucvars/data/gnomad/mod.rs b/src/strucvars/data/gnomad/mod.rs new file mode 100644 index 0000000..648b14d --- /dev/null +++ b/src/strucvars/data/gnomad/mod.rs @@ -0,0 +1,155 @@ +//! gnomAD gene constraints. + +//! Code for accessing gnomAD gene constraints data. + +use std::path::Path; + +use super::hgnc; + +pub mod io; + +/// The HI informative part of gnomAD constraints. +#[serde_with::skip_serializing_none] +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct Record { + /// The HGNC gene identifier. + pub hgnc_id: String, + /// pLI score. + pub pli: Option, + /// The upper bound of the loss-of-function observed/expected CI. + pub oe_lof_upper: Option, +} + +impl Record { + /// Create a new `Record` from `io::Record`. + /// + /// # Arguments + /// + /// * `other` - The `io::Record` to convert. + /// * `hgnc_id` - The HGNC gene identifier. + /// + /// # Returns + /// + /// A new `Record`. + pub fn from_io_record(other: io::Record, hgnc_id: String) -> Self { + Self { + hgnc_id, + pli: other.pli, + oe_lof_upper: other.oe_lof_upper, + } + } +} + +/// Load gnomAD constraints file and map ENSEMBL gene identifiers. +/// +/// # Arguments +/// +/// * `path` - Path to gnomAD constraints file. +/// * `hgnc_data` - `hgnc::Data` to use for gene ID mapping. +/// +/// # Returns +/// +/// gnomAD constraints. +/// +/// # Errors +/// +/// If anything goes wrong, it returns a generic `anyhow::Error`. +pub fn load_file

(path: P, hgnc_data: &hgnc::Data) -> Result, anyhow::Error> +where + P: AsRef, +{ + Ok(io::load_file(path)? + .into_iter() + .flat_map(|record| { + hgnc_data + .by_ensembl_gene_id(&record.ensembl_gene_id) + .map(|gene_id_info| Record::from_io_record(record, gene_id_info.hgnc_id.clone())) + }) + .collect::>()) +} + +/// Facade struct that allows easy acccess to the gnomAD constraint data. +#[derive(Debug, Clone)] +pub struct Data { + /// The gnomAD constraint. + data: Vec, + /// Mapping from HGNC identifier to gnomAD constraint data index. + hgnc_to_data_idx: rustc_hash::FxHashMap, +} + +impl Data { + /// Load from file and construct. + pub fn load

(path: P, hgnc_data: &hgnc::Data) -> Result + where + P: AsRef, + { + let data = load_file(path, hgnc_data)?; + Ok(Self::new(data)) + } + + /// Create a new `Data` object. + /// + /// # Arguments + /// + /// * `data` - The gnomAD constraint record. + /// + /// # Returns + /// + /// A new `Data` object. + pub fn new(data: Vec) -> Self { + let hgnc_to_data_idx = data + .iter() + .enumerate() + .map(|(idx, record)| (record.hgnc_id.clone(), idx)) + .collect::>(); + Self { + data, + hgnc_to_data_idx, + } + } + + /// Get the gnomAD constraint record for the given HGNC identifier. + /// + /// # Arguments + /// + /// * `hgnc_id` - HGNC identifier. + /// + /// # Returns + /// + /// The gnomAD constraint record for the given HGNC identifier, if any. + pub fn by_hgnc_id(&self, hgnc_id: &str) -> Option<&Record> { + self.hgnc_to_data_idx + .get(hgnc_id) + .map(|idx| &self.data[*idx]) + } +} + +#[cfg(test)] +mod test { + #[tracing_test::traced_test] + #[test] + fn test_load_file() -> Result<(), anyhow::Error> { + let records = super::load_file( + "tests/data/strucvars/gnomad_constraints.tsv", + &super::hgnc::Data::new("tests/data/hgnc.tsv")?, + )?; + + assert_eq!(records.len(), 18134); + insta::assert_yaml_snapshot!(&records[0..5]); + + Ok(()) + } + + #[tracing_test::traced_test] + #[test] + fn data_load() -> Result<(), anyhow::Error> { + let data = super::Data::load( + "tests/data/strucvars/gnomad_constraints.tsv", + &super::hgnc::Data::new("tests/data/hgnc.tsv")?, + )?; + + insta::assert_yaml_snapshot!(data.by_hgnc_id("HGNC:5")); + + Ok(()) + } +} diff --git a/src/strucvars/data/gnomad/snapshots/scarus__strucvars__data__gnomad__io__test__load_file.snap b/src/strucvars/data/gnomad/snapshots/scarus__strucvars__data__gnomad__io__test__load_file.snap new file mode 100644 index 0000000..92be4e3 --- /dev/null +++ b/src/strucvars/data/gnomad/snapshots/scarus__strucvars__data__gnomad__io__test__load_file.snap @@ -0,0 +1,130 @@ +--- +source: src/strucvars/data/gnomad/io.rs +expression: "&genes[0..5]" +--- +- ensembl_gene_id: ENSG00000004059 + entrez_id: "381" + gene_symbol: ARF5 + exp_lof: 9.0614 + exp_mis: 104.51 + exp_syn: 42.345 + mis_z: 2.2772 + obs_lof: 0 + obs_mis: 39 + obs_syn: 37 + oe_lof: 0 + oe_lof_lower: 0 + oe_lof_upper: 0.33 + oe_mis: 0.37316 + oe_mis_lower: 0.288 + oe_mis_upper: 0.487 + oe_syn: 0.87377 + oe_syn_lower: 0.671 + oe_syn_upper: 1.149 + pli: 0.94298 + syn_z: 0.6457 + exac_pli: 0.234 + exac_obs_lof: 2 + exac_exp_lof: 7.0385 + exac_oe_lof: 0.28415 +- ensembl_gene_id: ENSG00000003056 + entrez_id: "4074" + gene_symbol: M6PR + exp_lof: 15.53 + exp_mis: 155.34 + exp_syn: 57.136 + mis_z: 0.97915 + obs_lof: 6 + obs_mis: 121 + obs_syn: 47 + oe_lof: 0.38635 + oe_lof_lower: 0.211 + oe_lof_upper: 0.763 + oe_mis: 0.77893 + oe_mis_lower: 0.671 + oe_mis_upper: 0.906 + oe_syn: 0.8226 + oe_syn_lower: 0.65 + oe_syn_upper: 1.049 + pli: 0.0092215 + syn_z: 1.0541 + exac_pli: 0.71303 + exac_obs_lof: 2 + exac_exp_lof: 12.531 + exac_oe_lof: 0.15961 +- ensembl_gene_id: ENSG00000004478 + entrez_id: "2288" + gene_symbol: FKBP4 + exp_lof: 23.481 + exp_mis: 250.85 + exp_syn: 95.792 + mis_z: 1.2082 + obs_lof: 6 + obs_mis: 197 + obs_syn: 109 + oe_lof: 0.25553 + oe_lof_lower: 0.139 + oe_lof_upper: 0.504 + oe_mis: 0.78533 + oe_mis_lower: 0.698 + oe_mis_upper: 0.884 + oe_syn: 1.1379 + oe_syn_lower: 0.973 + oe_syn_upper: 1.334 + pli: 0.15539 + syn_z: -1.0608 + exac_pli: 0.92747 + exac_obs_lof: 2 + exac_exp_lof: 16.906 + exac_oe_lof: 0.1183 +- ensembl_gene_id: ENSG00000003137 + entrez_id: "56603" + gene_symbol: CYP26B1 + exp_lof: 16.349 + exp_mis: 328.51 + exp_syn: 150.65 + mis_z: 1.0492 + obs_lof: 1 + obs_mis: 275 + obs_syn: 161 + oe_lof: 0.061167 + oe_lof_lower: 0.021 + oe_lof_upper: 0.29 + oe_mis: 0.83711 + oe_mis_lower: 0.758 + oe_mis_upper: 0.925 + oe_syn: 1.0687 + oe_syn_lower: 0.939 + oe_syn_upper: 1.218 + pli: 0.97987 + syn_z: -0.66253 + exac_pli: 0.95435 + exac_obs_lof: 0 + exac_exp_lof: 9.4081 + exac_oe_lof: 0 +- ensembl_gene_id: ENSG00000003509 + entrez_id: "55471" + gene_symbol: NDUFAF7 + exp_lof: 23.494 + exp_mis: 237.41 + exp_syn: 85.64 + mis_z: -1.0514 + obs_lof: 27 + obs_mis: 283 + obs_syn: 100 + oe_lof: 1.1492 + oe_lof_lower: 0.846 + oe_lof_upper: 1.582 + oe_mis: 1.192 + oe_mis_lower: 1.081 + oe_mis_upper: 1.315 + oe_syn: 1.1677 + oe_syn_lower: 0.992 + oe_syn_upper: 1.379 + pli: 0.00000000000000000010818 + syn_z: -1.2198 + exac_pli: 0.000000000011773 + exac_obs_lof: 17 + exac_exp_lof: 17.154 + exac_oe_lof: 0.99105 + diff --git a/src/strucvars/data/gnomad/snapshots/scarus__strucvars__data__gnomad__test__data_load.snap b/src/strucvars/data/gnomad/snapshots/scarus__strucvars__data__gnomad__test__data_load.snap new file mode 100644 index 0000000..9c058dd --- /dev/null +++ b/src/strucvars/data/gnomad/snapshots/scarus__strucvars__data__gnomad__test__data_load.snap @@ -0,0 +1,8 @@ +--- +source: src/strucvars/data/gnomad/mod.rs +expression: "data.by_hgnc_id(\"HGNC:5\")" +--- +hgnc_id: "HGNC:5" +pli: 0.0000000049917 +oe_lof_upper: 1.208 + diff --git a/src/strucvars/data/gnomad/snapshots/scarus__strucvars__data__gnomad__test__load_file.snap b/src/strucvars/data/gnomad/snapshots/scarus__strucvars__data__gnomad__test__load_file.snap new file mode 100644 index 0000000..76a84e4 --- /dev/null +++ b/src/strucvars/data/gnomad/snapshots/scarus__strucvars__data__gnomad__test__load_file.snap @@ -0,0 +1,20 @@ +--- +source: src/strucvars/data/gnomad/mod.rs +expression: "&records[0..5]" +--- +- hgnc_id: "HGNC:658" + pli: 0.94298 + oe_lof_upper: 0.33 +- hgnc_id: "HGNC:6752" + pli: 0.0092215 + oe_lof_upper: 0.763 +- hgnc_id: "HGNC:3720" + pli: 0.15539 + oe_lof_upper: 0.504 +- hgnc_id: "HGNC:20581" + pli: 0.97987 + oe_lof_upper: 0.29 +- hgnc_id: "HGNC:28816" + pli: 0.00000000000000000010818 + oe_lof_upper: 1.582 + diff --git a/src/strucvars/data/hgnc.rs b/src/strucvars/data/hgnc.rs new file mode 100644 index 0000000..dcb1974 --- /dev/null +++ b/src/strucvars/data/hgnc.rs @@ -0,0 +1,180 @@ +//! HGNC-based gene identifier mapping. + +use std::{io::BufReader, path::Path}; + +/// Mapping of gene identifiers. +#[derive(Debug, Clone, Default, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct GeneIdInfo { + /// HGNC identifier. + pub hgnc_id: String, + /// Official HGNC gene symbol. + #[serde(alias = "gene_symbol")] + pub hgnc_symbol: Option, + /// ENSEMBL gene identifier. + pub ensembl_gene_id: Option, + /// NCBI gene identifier. + #[serde(alias = "entrez_id")] + pub ncbi_gene_id: Option, +} + +/// Load `HGVS` gene mapping file. +/// +/// # Arguments +/// +/// * `path` - Path to the mapping TSV file with the fields +/// `hgnc_id`, `ensembl_gene_id`, `entrez_id`, `gene_symbol`. +/// +/// # Returns +/// +/// Mapping records. +/// +/// # Errors +/// +/// If anything goes wrong, it returns a generic `anyhow::Error`. +pub fn load_file

(path: P) -> Result, anyhow::Error> +where + P: AsRef, +{ + // Construct buffered file and CSV reader. + let reader = std::fs::File::open(path) + .map_err(|e| anyhow::anyhow!("problem opening file: {}", e)) + .map(BufReader::new)?; + let mut csv_reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .flexible(false) + .from_reader(reader); + let mut result = Vec::new(); + for record in csv_reader.deserialize() { + let record = record.map_err(|e| anyhow::anyhow!("problem parsing record: {}", e))?; + result.push(record); + } + + Ok(result) +} + +/// Facade struct for accessing gene identifier information. +pub struct Data { + /// Gene identifier information. + gene_id_infos: Vec, + /// Mapping from HGNC identifier to gene identifier information. + hgnc_to_infos_idx: rustc_hash::FxHashMap, + /// Mapping from NCBI gene ID to gene identifier information. + ncbi_to_infos_idx: rustc_hash::FxHashMap, + /// Mapping from ENSEMBL gene ID to gene identifier information. + ensg_to_infos_idx: rustc_hash::FxHashMap, +} + +impl Data { + /// Construct from the given path. + /// + /// # Arguments + /// + /// * `path_hgnc` - Path to HGNC gene identifier mapping file. + /// + /// # Returns + /// + /// A new `GeneIdData`. + /// + /// # Errors + /// + /// If anything goes wrong, it returns a generic `anyhow::Error`. + pub fn new

(path_hgnc: P) -> Result + where + P: AsRef, + { + let gene_id_infos = load_file(path_hgnc.as_ref())?; + let hgnc_to_infos_idx = gene_id_infos + .iter() + .enumerate() + .map(|(idx, info)| (info.hgnc_id.clone(), idx)) + .collect(); + let ncbi_to_infos_idx = gene_id_infos + .iter() + .enumerate() + .filter_map(|(idx, info)| info.ncbi_gene_id.as_ref().map(|id| (id.clone(), idx))) + .collect(); + let ensg_to_infos_idx = gene_id_infos + .iter() + .enumerate() + .filter_map(|(idx, info)| info.ensembl_gene_id.as_ref().map(|id| (id.clone(), idx))) + .collect(); + Ok(Self { + gene_id_infos, + hgnc_to_infos_idx, + ncbi_to_infos_idx, + ensg_to_infos_idx, + }) + } + + /// Obtain the gene identifier information for the given HGNC identifier. + /// + /// # Arguments + /// + /// * `hgnc_id` - HGNC identifier. + /// + /// # Returns + /// + /// The gene identifier information, if any available for `hgnc_id`. + pub fn by_hgnc_id(&self, hgnc_id: &str) -> Option<&GeneIdInfo> { + self.hgnc_to_infos_idx + .get(hgnc_id) + .map(|idx| &self.gene_id_infos[*idx]) + } + + /// Obtain the gene identifier information for the given NCBI gene ID. + /// + /// # Arguments + /// + /// * `ncbi_gene_id` - NCBI gene identifier. + /// + /// # Returns + /// + /// The gene identifier information, if any available for `hgnc_id`. + pub fn by_ncbi_gene_id(&self, ncbi_gene_id: &str) -> Option<&GeneIdInfo> { + self.ncbi_to_infos_idx + .get(ncbi_gene_id) + .map(|idx| &self.gene_id_infos[*idx]) + } + + /// Obtain the gene identifier information for the given NCBI gene ID. + /// + /// # Arguments + /// + /// * `ensembl_gene_id` - ENSEMBL gene identifier. + /// + /// # Returns + /// + /// The gene identifier information, if any available for `hgnc_id`. + pub fn by_ensembl_gene_id(&self, ensembl_gene_id: &str) -> Option<&GeneIdInfo> { + self.ensg_to_infos_idx + .get(ensembl_gene_id) + .map(|idx| &self.gene_id_infos[*idx]) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_load_file() -> Result<(), anyhow::Error> { + let path = "tests/data/hgnc.tsv"; + let records = load_file(path)?; + + assert_eq!(records.len(), 43626); + insta::assert_yaml_snapshot!(&records[0..5]); + + Ok(()) + } + + #[test] + fn test_gene_id_data() -> Result<(), anyhow::Error> { + let data = Data::new("tests/data/hgnc.tsv")?; + + assert_eq!(data.gene_id_infos.len(), 43626); + insta::assert_yaml_snapshot!(data.by_hgnc_id("HGNC:1100")); + + Ok(()) + } +} diff --git a/src/strucvars/data/intervals.rs b/src/strucvars/data/intervals.rs new file mode 100644 index 0000000..faa0c92 --- /dev/null +++ b/src/strucvars/data/intervals.rs @@ -0,0 +1,102 @@ +//! Interval operations for rust-bio Intervals. + +use bio::bio_types::genome::AbstractInterval as _; + +/// The type to work with. +pub type Interval = bio::bio_types::genome::Interval; + +/// Returns whether two intervals overlap. +/// +/// # Arguments +/// +/// * `lhs` - Left-hand side interval. +/// * `rhs` - Right-hand side interval. +/// +/// # Returns +/// +/// `true` if the intervals overlap, `false` otherwise. +pub fn do_overlap(lhs: &Interval, rhs: &Interval) -> bool { + if lhs.contig() != rhs.contig() { + false + } else { + let a = lhs.range(); + let b = rhs.range(); + let intersect = std::cmp::max(a.start, b.start)..std::cmp::min(a.end, b.end); + intersect.start < intersect.end + } +} + +/// Returns whether `lhs` interval contains the `rhs` one. +/// +/// # Arguments +/// +/// * `lhs` - Left-hand side interval. +/// * `rhs` - Right-hand side interval. +/// +/// # Returns +/// +/// `true` if the intervals overlap, `false` otherwise. +pub fn contains(lhs: &Interval, rhs: &Interval) -> bool { + if lhs.contig() != rhs.contig() { + false + } else { + let a = lhs.range(); + let b = rhs.range(); + a.start <= b.start && a.end >= b.end + } +} + +/// Helper function that converts an `TxExonsRecord` into an Interval. +pub fn exon_to_interval(chrom: String, record: &hgvs::data::interface::TxExonsRecord) -> Interval { + Interval::new( + chrom, + (record.alt_start_i as u64)..(record.alt_end_i as u64), + ) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn interval_overlaps_true() { + let a = Interval::new("chr1".into(), 10..20); + let b = Interval::new("chr1".into(), 15..25); + assert!(do_overlap(&a, &b)); + } + + #[test] + fn interval_overlaps_false() { + let a = Interval::new("chr1".into(), 10..15); + let b = Interval::new("chr1".into(), 15..25); + assert!(!do_overlap(&a, &b)); + } + + #[test] + fn interval_overlaps_different_contig() { + let a = Interval::new("chr1".into(), 10..20); + let b = Interval::new("chr2".into(), 15..25); + assert!(!do_overlap(&a, &b)); + } + + #[test] + fn interval_contains_true() { + let a = Interval::new("chr1".into(), 10..20); + let b = Interval::new("chr1".into(), 15..18); + assert!(contains(&a, &b)); + } + + #[test] + fn interval_contains_false() { + let a = Interval::new("chr1".into(), 10..20); + let b = Interval::new("chr1".into(), 15..25); + assert!(!contains(&a, &b)); + } + + #[test] + fn interval_contains_different_contig() { + let a = Interval::new("chr1".into(), 10..20); + let b = Interval::new("chr2".into(), 15..20); + assert!(!contains(&a, &b)); + } +} diff --git a/src/strucvars/data/mod.rs b/src/strucvars/data/mod.rs new file mode 100644 index 0000000..cf6b20f --- /dev/null +++ b/src/strucvars/data/mod.rs @@ -0,0 +1,7 @@ +//! Representation and I/O of datasets. + +pub mod clingen_dosage; +pub mod decipher_hi; +pub mod gnomad; +pub mod hgnc; +pub mod intervals; diff --git a/src/strucvars/data/snapshots/scarus__strucvars__data__decipher_hi__test__data_load.snap b/src/strucvars/data/snapshots/scarus__strucvars__data__decipher_hi__test__data_load.snap new file mode 100644 index 0000000..539e20d --- /dev/null +++ b/src/strucvars/data/snapshots/scarus__strucvars__data__decipher_hi__test__data_load.snap @@ -0,0 +1,9 @@ +--- +source: src/strucvars/data/decipher_hi.rs +expression: "data.by_hgnc_id(\"HGNC:1100\")" +--- +hgnc_id: "HGNC:1100" +hgnc_symbol: BRCA1 +p_hi: 0.986984945 +hi_index: 1.2 + diff --git a/src/strucvars/data/snapshots/scarus__strucvars__data__hgnc__test__gene_id_data.snap b/src/strucvars/data/snapshots/scarus__strucvars__data__hgnc__test__gene_id_data.snap new file mode 100644 index 0000000..d702d7e --- /dev/null +++ b/src/strucvars/data/snapshots/scarus__strucvars__data__hgnc__test__gene_id_data.snap @@ -0,0 +1,9 @@ +--- +source: src/strucvars/data/hgnc.rs +expression: "data.by_hgnc_id(\"HGNC:1100\")" +--- +hgnc_id: "HGNC:1100" +hgnc_symbol: BRCA1 +ensembl_gene_id: ENSG00000012048 +ncbi_gene_id: "672" + diff --git a/src/strucvars/data/snapshots/scarus__strucvars__data__hgnc__test__load_file.snap b/src/strucvars/data/snapshots/scarus__strucvars__data__hgnc__test__load_file.snap new file mode 100644 index 0000000..95611a5 --- /dev/null +++ b/src/strucvars/data/snapshots/scarus__strucvars__data__hgnc__test__load_file.snap @@ -0,0 +1,25 @@ +--- +source: src/strucvars/data/hgnc.rs +expression: "&records[0..5]" +--- +- hgnc_id: "HGNC:5" + hgnc_symbol: A1BG + ensembl_gene_id: ENSG00000121410 + ncbi_gene_id: "1" +- hgnc_id: "HGNC:37133" + hgnc_symbol: A1BG-AS1 + ensembl_gene_id: ENSG00000268895 + ncbi_gene_id: "503538" +- hgnc_id: "HGNC:24086" + hgnc_symbol: A1CF + ensembl_gene_id: ENSG00000148584 + ncbi_gene_id: "29974" +- hgnc_id: "HGNC:7" + hgnc_symbol: A2M + ensembl_gene_id: ENSG00000175899 + ncbi_gene_id: "2" +- hgnc_id: "HGNC:27057" + hgnc_symbol: A2M-AS1 + ensembl_gene_id: ENSG00000245105 + ncbi_gene_id: "144571" + diff --git a/src/strucvars/ds.rs b/src/strucvars/ds.rs new file mode 100644 index 0000000..09d1466 --- /dev/null +++ b/src/strucvars/ds.rs @@ -0,0 +1,63 @@ +//! Shared data structures for `strucvars`. + +use super::data::intervals::Interval; + +/// Enumeration for SV type. +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + PartialOrd, + Ord, + Hash, + Default, + serde::Deserialize, + serde::Serialize, +)] +pub enum SvType { + /// Deletion, copy number loss. + #[default] + Del, + /// Duplication, copy number gain. + Dup, +} + +/// Representation of inner / outer coordinates. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, serde::Deserialize, serde::Serialize)] +pub struct AmbiguousRange { + /// Outer start position. + pub outer_start: u32, + /// Inner start position. + pub inner_start: u32, + /// Inner end position. + pub inner_end: u32, + /// Outer end position. + pub outer_end: u32, +} + +/// Representation of a structural variant. +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct StructuralVariant { + /// Chromosome. + pub chrom: String, + /// 1-based start position. + pub start: u32, + /// 1-based stop position. + pub stop: u32, + /// SV type. + pub svtype: SvType, + + /// Optional ambiguous start/end positions. + pub ambiguous_range: Option, +} + +impl From for Interval { + fn from(val: StructuralVariant) -> Self { + Interval::new( + val.chrom, + val.start.saturating_sub(1).into()..val.stop.into(), + ) + } +} diff --git a/src/strucvars/eval/common.rs b/src/strucvars/eval/common.rs new file mode 100644 index 0000000..2176f12 --- /dev/null +++ b/src/strucvars/eval/common.rs @@ -0,0 +1,16 @@ +//! Common functions and data strucdtures for evaluation. + +/// Scores of the results in individual categories. +pub trait SuggestedScore { + /// Suggested score for the category. + fn suggested_score(&self) -> f32; +} + +/// Score range for a seciton. +pub trait ScoreRange { + /// Minimal score for the category. + fn min_score(&self) -> f32; + + /// Maximum score for the category. + fn max_score(&self) -> f32; +} diff --git a/src/strucvars/eval/del/mod.rs b/src/strucvars/eval/del/mod.rs new file mode 100644 index 0000000..4cbc7d7 --- /dev/null +++ b/src/strucvars/eval/del/mod.rs @@ -0,0 +1,106 @@ +//! Evaluation of deletions, loss of copy number. +//! +//! Note that in Section 4, only 4O can be fully automatically determined and we do not have +//! support for any automatic annotation for Section 5. We only provide data structures for +//! storing sections where we can provide automatic annotation. + +pub mod result; +pub mod section1; +pub mod section2; +pub mod section3; +pub mod section4; + +use crate::strucvars::ds::StructuralVariant; + +/// Evaluation of deletions, loss of copy number. +/// +/// This is mainly used to encapsulate the functionality. Creating new such +/// objects is very straightforward and cheap. +pub struct Evaluator<'a> { + /// The parent evaluator. + parent: &'a super::Evaluator, +} + +impl<'a> Evaluator<'a> { + /// Create a new `Evaluator`. + pub fn with_parent(parent: &'a super::Evaluator) -> Self { + Self { parent } + } + + /// Perform the evaluation of the different sections. + /// + /// # Arguments + /// + /// * `strucvar` - Structural variant to be evaluated. + /// + /// # Returns + /// + /// The evaluation result. + /// + /// # Errors + /// + /// If anything goes wrong, it returns a generic `anyhow::Error`. + pub fn evaluate( + &self, + strucvar: &StructuralVariant, + ) -> Result, anyhow::Error> { + // Evaluate section 1: "Contains known functionally important elements". + let result_s1 = section1::Evaluator::with_parent(self.parent).evaluate(strucvar)?; + // Evaluate section 2: "Overlap wih established/predicted haploinsufficient (HI) or + // established benign genes/genomic regions (skip to seciton 3 if your copy number loss + // does not overlap these types of genes/regions)". + let result_s2 = section2::Evaluator::with_parent(self.parent).evaluate(strucvar)?; + // Evaluate section 3: "Evaluation of gene number". + let result_s3 = section3::Evaluator::new().evaluate(&result_s1)?; + // Evaluate section 4: "Detailed evaluation of genomic content using cases from published literature, + // public databases, and/or internal lab data". + let result_s4 = section4::Evaluator::with_parent(self.parent).evaluate(strucvar)?; + + let mut result = Vec::default(); + result.push(result_s1); + result.append(&mut result_s2.clone()); + result.push(result_s3); + result.append(&mut result_s4.clone()); + Ok(result) + } +} + +#[cfg(test)] +mod test { + use super::super::test::global_evaluator_37; + use crate::strucvars::ds::{StructuralVariant, SvType}; + use crate::strucvars::eval::common::SuggestedScore; + + // #[tracing_test::traced_test] + #[rstest::rstest] + #[case("1", 11_937_319, 11_944_386, -0.6)] // empty region left of MFN2 + #[case("1", 12_098_550, 12_103_898, -0.6)] // empty region right of MFN2 + #[case("1", 12_050_913, 12_054_733, 0.0)] // contains exon 4 of MFN2 + fn test_evaluate( + #[case] chrom: &str, + #[case] start: u32, + #[case] stop: u32, + #[case] expected_score: f32, + global_evaluator_37: super::super::Evaluator, + ) -> Result<(), anyhow::Error> { + mehari::common::set_snapshot_suffix!("{}-{}-{}", chrom, start, stop); + + let evaluator = super::Evaluator::with_parent(&global_evaluator_37); + let result = evaluator.evaluate(&StructuralVariant { + chrom: chrom.into(), + start, + stop, + svtype: SvType::Del, + ..Default::default() + })?; + + assert_eq_float::assert_eq_float!( + result[0].suggested_score(), + expected_score, + f32::EPSILON + ); + insta::assert_yaml_snapshot!(result); + + Ok(()) + } +} diff --git a/src/strucvars/eval/del/result.rs b/src/strucvars/eval/del/result.rs new file mode 100644 index 0000000..aaeebb7 --- /dev/null +++ b/src/strucvars/eval/del/result.rs @@ -0,0 +1,520 @@ +//! Data structures for representing the actual results. + +use crate::strucvars::{ + data::{clingen_dosage, hgnc::GeneIdInfo}, + ds::StructuralVariant, + eval::common::{ScoreRange, SuggestedScore}, +}; + +/// Evaluation results for each section of the ACMG rule. +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub enum Section { + /// Results of Section L1. + L1(L1), + /// Results of Section L2. + L2(L2), + /// Results of Section L3. + L3(L3), + /// Results of Section L4. + L4(L4), +} + +impl SuggestedScore for Section { + fn suggested_score(&self) -> f32 { + match self { + Section::L1(L1::L1A(l1a)) => l1a.suggested_score(), + Section::L1(L1::L1B(l1b)) => l1b.suggested_score(), + Section::L2(L2::L2A(l2a)) => l2a.suggested_score(), + Section::L2(L2::L2B(l2b)) => l2b.suggested_score(), + Section::L2(L2::L2C1(l2c1)) => l2c1.suggested_score(), + Section::L2(L2::L2C2(l2c2)) => l2c2.suggested_score(), + Section::L2(L2::L2D1(l2d1)) => l2d1.suggested_score(), + Section::L2(L2::L2D2(l2d2)) => l2d2.suggested_score(), + Section::L2(L2::L2D3(l2d3)) => l2d3.suggested_score(), + Section::L2(L2::L2D4(l2d4)) => l2d4.suggested_score(), + Section::L2(L2::L2E(l2e)) => l2e.suggested_score(), + Section::L2(L2::L2F(l2f)) => l2f.suggested_score(), + Section::L2(L2::L2G(l2g)) => l2g.suggested_score(), + Section::L2(L2::L2H(l2h)) => l2h.suggested_score(), + Section::L3(L3::L3A(l3a)) => l3a.suggested_score(), + Section::L3(L3::L3B(l3b)) => l3b.suggested_score(), + Section::L3(L3::L3C(l3c)) => l3c.suggested_score(), + Section::L4(L4::L4Dangling(l4x)) => l4x.suggested_score(), + Section::L4(L4::L4O(l4o)) => l4o.suggested_score(), + } + } +} + +impl ScoreRange for Section { + fn min_score(&self) -> f32 { + match self { + Section::L1(L1::L1A(_)) => 0.0, + Section::L1(L1::L1B(_)) => -0.6, + Section::L2(L2::L2A(_)) => 1.0, + Section::L2(L2::L2B(_)) => 0.0, + Section::L2(L2::L2C1(_)) => 0.9, + Section::L2(L2::L2C2(_)) => 0.0, + Section::L2(L2::L2D1(_)) => 0.0, + Section::L2(L2::L2D2(_)) => 0.45, + Section::L2(L2::L2D3(_)) => 0.0, + Section::L2(L2::L2D4(_)) => 0.45, + Section::L2(L2::L2E(_)) => 0.0, + Section::L2(L2::L2F(_)) => -1.0, + Section::L2(L2::L2G(_)) => 0.0, + Section::L2(L2::L2H(_)) => 0.15, + Section::L3(L3::L3A(_)) => 0.0, + Section::L3(L3::L3B(_)) => 0.45, + Section::L3(L3::L3C(_)) => 0.9, + Section::L4(L4::L4Dangling(_)) => -0.9, + Section::L4(L4::L4O(_)) => -1.0, + } + } + + fn max_score(&self) -> f32 { + match self { + Section::L1(L1::L1A(_)) => 0.0, + Section::L1(L1::L1B(_)) => -0.6, + Section::L2(L2::L2A(_)) => 1.0, + Section::L2(L2::L2B(_)) => 0.0, + Section::L2(L2::L2C1(_)) => 1.0, + Section::L2(L2::L2C2(_)) => 0.45, + Section::L2(L2::L2D1(_)) => 0.0, + Section::L2(L2::L2D2(_)) => 0.90, + Section::L2(L2::L2D3(_)) => 0.45, + Section::L2(L2::L2D4(_)) => 1.0, + Section::L2(L2::L2E(_)) => 0.9, + Section::L2(L2::L2F(_)) => -1.0, + Section::L2(L2::L2G(_)) => 0.0, + Section::L2(L2::L2H(_)) => 0.15, + Section::L3(L3::L3A(_)) => 0.0, + Section::L3(L3::L3B(_)) => 0.45, + Section::L3(L3::L3C(_)) => 0.9, + Section::L4(L4::L4Dangling(_)) => 0.45, + Section::L4(L4::L4O(_)) => 0.0, + } + } +} + +/// Enumeration of the categories for the structural variant evaluation, Section 1. +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub enum L1 { + /// Contains protein-coding or other known functionally important elements. + L1A(L1A), + /// Does NOT contain protein-coding or any functionally important elements. + L1B(L1B), +} + +/// Per-gene transcript overlaps as part of `L1A`. +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct GeneOverlap { + /// Gene identifiers. + pub gene: GeneIdInfo, + /// Transcript identifiers of this gene. + pub tx_ids: Vec, +} + +impl GeneOverlap { + /// Create a new `GeneOverlap`. + /// + /// # Arguments + /// + /// * `gene` - Gene identifier. + /// * `tx_ids` - Transcript identifiers of this gene. + /// + /// # Returns + /// + /// A new `GeneOverlap`. + pub fn new(gene: GeneIdInfo, tx_ids: Vec) -> Self { + Self { gene, tx_ids } + } +} + +/// Result of the L1A subsection (important feature). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L1A { + /// Overlapping transcripts/genes. + pub genes: Vec, +} + +impl SuggestedScore for L1A { + fn suggested_score(&self) -> f32 { + 0.0 + } +} + +/// Result of the L1B subsection (no important feature). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L1B { + // no members +} + +impl SuggestedScore for L1B { + fn suggested_score(&self) -> f32 { + -0.6 + } +} + +/// Enumeration of the categories for the structural variant evaluation, Section 2. +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub enum L2 { + /// Complete overlap of an established HI gene/genomic region. + L2A(L2A), + /// Partial overlap of an established HI genomic region. + L2B(L2B), + /// Partial overlap of 5' end of established HI gene ... coding sequence is involved. + L2C1(L2C1), + /// Partial overlap of 5' end of established HI gene ... only the 5' UTR is involved. + L2C2(L2C2), + /// Partial overlap of 3' end of established HI gene ... only the 3' UTR is involved. + L2D1(L2D1), + /// Partial overlap of 3' end of established HI gene ... only the last exon is involved, + /// etablished pathogenic variants exist in this exon. + L2D2(L2D2), + /// Partial overlap of 3' end of established HI gene ... only the last exon is involved, + /// no established pathogenic variants exist in this exon. + L2D3(L2D3), + /// Partial overlap of 3' end of established HI gene ... it includes other exons in addition + /// to the last exon; nonsense-mediated decay is expected to occur. + L2D4(L2D4), + /// Both breakpoints are within the same gene (intragenic CNV, gene-level sequence variant + /// ... PVS1 rules apply). + L2E(L2E), + /// Completely contained within an established benign CNV region. + L2F(L2F), + /// Overlaps with an established benign CNV, but includes additional genomic material. + L2G(L2G), + /// Two or more HI predictors suggest that AT LEAST ONE gene in the interval is HI. + L2H(L2H), +} + +/// Result of the L2A subsection (complete overlap of an established HI gene/genomic region). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L2A { + /// Suggested score for the subsection. + pub suggested_score: f32, + /// Overlapping HI genes. + pub hi_genes: Vec, + /// Overlapping HI genomic regions. + pub hi_regions: Vec, +} + +impl SuggestedScore for L2A { + fn suggested_score(&self) -> f32 { + self.suggested_score + } +} + +/// Result of the L2B subsection (partial overlap of an established HI genomic region). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L2B { + // no members, just signal to the user +} + +impl SuggestedScore for L2B { + fn suggested_score(&self) -> f32 { + 0.0 + } +} + +/// Result of the L2C1 subsection (partial overlap of 5' end of established HI gene +/// ... coding sequence is involved). +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct L2C1 { + /// Suggested score for the subsection. + pub suggested_score: f32, + /// Overlapping gene information. + pub gene: GeneIdInfo, +} + +impl SuggestedScore for L2C1 { + fn suggested_score(&self) -> f32 { + self.suggested_score + } +} + +/// Result of the L2C2 subsection (partial overlap of 5' end of established HI gene +/// ... only the 5' UTR is involved). +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct L2C2 { + /// Suggested score for the subsection. + pub suggested_score: f32, + /// Overlapping gene information. + pub gene: GeneIdInfo, +} + +impl SuggestedScore for L2C2 { + fn suggested_score(&self) -> f32 { + self.suggested_score + } +} + +/// Result of the L2D1 subsection (partial overlap of 3' end of established HI gene +/// ... only the 3' UTR is involved). +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct L2D1 { + /// Overlapping gene information. + pub gene: GeneIdInfo, +} + +impl SuggestedScore for L2D1 { + fn suggested_score(&self) -> f32 { + 0.0 + } +} + +/// Result of the L2D2 subsection (partial overlap of 3' end of established HI gene +/// ... only the last exon is involved; other established pathogenic variants have +/// been reported in this exon). +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct L2D2 { + /// Suggested score for the subsection. + pub suggested_score: f32, + /// Overlappign gene information. + pub gene: GeneIdInfo, + /// VCV identifiers of ClinVar variants supporting the score. + pub clinvar_ids: Vec, +} + +impl SuggestedScore for L2D2 { + fn suggested_score(&self) -> f32 { + self.suggested_score + } +} + +/// Result of the L2D3 subsection (partial overlap of 3' end of established HI gene +/// ... only the last exon is involved; no other established pathogenic variants have +/// been reported in this exon). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L2D3 { + /// Suggested score for the subsection. + pub suggested_score: f32, + /// Overlapping gene information. + pub gene: GeneIdInfo, +} + +impl SuggestedScore for L2D3 { + fn suggested_score(&self) -> f32 { + self.suggested_score + } +} + +/// Result of the L2D4 subsection (partial overlap of 3' end of established HI gene +/// ... it includes other exons in addition to the last exon; nonsense-mediate +/// decay is expected to occur). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L2D4 { + /// Suggested score for the subsection. + pub suggested_score: f32, + /// Overlapping gene information. + pub gene: GeneIdInfo, + /// Numbers of the overlapping exons. + pub exon_nos: Vec, +} + +impl SuggestedScore for L2D4 { + fn suggested_score(&self) -> f32 { + self.suggested_score + } +} + +/// Enumeration describing the PVS1 results. +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + PartialOrd, + Ord, + Default, + serde::Deserialize, + serde::Serialize, +)] +pub enum Pvs1Result { + /// PVS1 + #[default] + Pvs1, + /// PVS1_Strong + Pvs1Strong, + /// PVS1_Moderate + Pvs1Moderate, + /// PVS1_Supporting + Pvs1Supporting, +} + +/// Result of the L2E subsection (both breakpoints are within the same gene (intragenic +/// CNV, gene-level sequence variant). PVS1 rules apply from ClinGen SVI WG. +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L2E { + /// Suggested score for the subsection. + pub suggested_score: f32, + /// VCV identifiers of ClinVar variants supporting the score. + pub pvs1_result: Pvs1Result, +} + +impl SuggestedScore for L2E { + fn suggested_score(&self) -> f32 { + self.suggested_score + } +} + +/// Result of the L2F subsection (completely contained with an established benign CNV region). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L2F { + /// Suggested score for the subsection. + pub suggested_score: f32, + /// Overlapping CNV benign genomic regions. + pub benign_regions: Vec, +} + +impl SuggestedScore for L2F { + fn suggested_score(&self) -> f32 { + self.suggested_score + } +} + +/// Result of the L2G subsection (overlaps with an established benign CNV, but includes additional +/// genomic material. +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L2G { + /// Suggested score for the subsection. + pub suggested_score: f32, + /// Overlapping CNV benign genomic regions. + pub benign_regions: Vec, +} + +impl SuggestedScore for L2G { + fn suggested_score(&self) -> f32 { + self.suggested_score + } +} + +/// HI predictor results for use in `L2H`. +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub enum GeneHiPredictorResult { + /// gnomAD pLI results + GnomadPli { + /// gnomAD pLI score + pli_score: f64, + /// upper bound of LoF observed/expected confidence interval + oe_lof_upper: f64, + }, + /// DECIPHER HI index + DecipherHiIndex { + /// DECIPHER HI index + hi_index: f64, + }, +} + +/// HI prediction results for a single gene. +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct GeneHiPrediction { + /// Identifier of the gene. + pub gene: GeneIdInfo, + /// Results of the HI predictors. + pub results: Vec, +} + +/// Result of the L2H subsection (two or more HI predictors suggest that AT LEAST ONE gene in the +/// interval is HI). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L2H { + /// Suggested score for the subsection. + pub suggested_score: f32, + /// HGNC identifiers of the overlapping predicted HI genes. + pub gene_hi_predictions: Vec, +} + +impl SuggestedScore for L2H { + fn suggested_score(&self) -> f32 { + self.suggested_score + } +} + +/// Enumeration of the categories for the structural variant evaluation, Section 2. +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub enum L3 { + /// <=24 genes. + L3A(L3Count), + /// 25-34 genes. + L3B(L3Count), + /// >=35 genes. + L3C(L3Count), +} + +/// Result of the 3A subsection (Number of protein-coding RefSeq genes wholly or partially included +/// in the copy-number loss). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L3Count { + /// Suggested score for the subsection. + pub suggested_score: f32, + /// Number of protein-coding RefSeq genes wholly or partially included in the copy-number loss. + pub num_genes: usize, + /// The overlapping genes. + pub genes: Vec, +} + +impl SuggestedScore for L3Count { + fn suggested_score(&self) -> f32 { + self.suggested_score + } +} + +/// Enumeration of the categories for the structural variant evaluation, Section 4. +/// +/// Only 4O can be automatically determined. +#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] +pub enum L4 { + /// "Dangling" information of overlapping variants - must be resolved by a human. + L4Dangling(L4Dangling), + /// Case–control and population evidence; Overlap with common population variation. + L4O(L4O), +} + +/// Information abbout a SV from ClinVar (RCV level; dependent on condition). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct ClinvarRcvRecord { + /// ClinVar RCV identifier. + pub rcv: String, + /// The free text condition description. + pub condition: String, +} + +/// Information about a SV from ClinVar (VCV level; independent of condition). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct ClinvarStructuralVariant { + /// ClinVar VCV identifier. + pub vcv: String, + /// The structural variant. + pub sv: StructuralVariant, + /// The RCVs, corresponding to occurence of variant in a case with a condition. + pub rcv_records: Vec, +} + +/// Result of the 4O subsection (overlap with common population variation). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L4Dangling { + /// Accession identifiers of overlapping variants. + pub common_variant_ids: Vec, +} + +impl SuggestedScore for L4Dangling { + fn suggested_score(&self) -> f32 { + 0.0 + } +} + +/// Result of the 4O subsection (overlap with common population variation). +#[derive(Debug, Clone, PartialEq, Default, serde::Deserialize, serde::Serialize)] +pub struct L4O { + /// Suggested score for the subsection. + pub suggested_score: f32, + /// Accession identifiers of overlapping common variants. + pub common_variant_ids: Vec, +} + +impl SuggestedScore for L4O { + fn suggested_score(&self) -> f32 { + self.suggested_score + } +} diff --git a/src/strucvars/eval/del/section1.rs b/src/strucvars/eval/del/section1.rs new file mode 100644 index 0000000..0d2b400 --- /dev/null +++ b/src/strucvars/eval/del/section1.rs @@ -0,0 +1,138 @@ +//! Implementation of evaluation of copy number loss section 1. + +use hgvs::data::interface::Provider; +use itertools::Itertools; + +use super::result::{Section, L1, L1A, L1B}; +use crate::strucvars::{ds::StructuralVariant, eval::del::result::GeneOverlap}; + +/// Evaluation of deletions, loss of copy number. +/// +/// This is mainly used to encapsulate the functionality. Creating new such +/// objects is very straightforward and cheap. +pub struct Evaluator<'a> { + /// The parent evaluator. + parent: &'a super::super::Evaluator, +} + +impl<'a> Evaluator<'a> { + /// Create a new `Evaluator`. + pub fn with_parent(parent: &'a super::super::Evaluator) -> Self { + Self { parent } + } + + /// Perform the evaluation of copy number loss Section 1 and all subsection. + /// + /// # Arguments + /// + /// * `strucvar` - Structural variant to be evaluated. + /// + /// # Returns + /// + /// Returns the evaluation results of the section + /// + /// # Errors + /// + /// If anything goes wrong, it returns a generic `anyhow::Error`. + pub fn evaluate(&self, strucvar: &StructuralVariant) -> Result { + let genes = self.overlaps_with_elements(strucvar).map_err(|e| { + anyhow::anyhow!("issue with overlap computation of {:?}: {}", strucvar, e) + })?; + + if !genes.is_empty() { + Ok(Section::L1(L1::L1A(L1A { genes }))) + } else { + Ok(Section::L1(L1::L1B(L1B::default()))) + } + } + + /// Determines whether `strucvar` overlaps with any functionally important elements. + /// + /// # Arguments + /// + /// * `strucvar` - Structural variant to be evaluated. + /// + /// # Returns + /// + /// Overlapping gene / transcript information. + /// + /// # Errors + /// + /// When the chromosome name could not be resolved or there was a problem with + /// accessing the transcript database. + fn overlaps_with_elements( + &self, + strucvar: &StructuralVariant, + ) -> Result, anyhow::Error> { + // Map chromosome name (e.g., chr1) to chromosome accession in this assembly. + let chrom_acc = self + .parent + .chrom_to_ac + .get(&strucvar.chrom) + .ok_or_else(|| { + anyhow::anyhow!("could not resolve chromosome name `{}`", strucvar.chrom) + })?; + + // Obtain the overlapping transcripts for the given region. + let txs = self + .parent + .provider + .get_tx_for_region( + chrom_acc, + "splign", + strucvar + .start + .try_into() + .map_err(|e| anyhow::anyhow!("could not convert start position: {}", e))?, + strucvar + .stop + .try_into() + .map_err(|e| anyhow::anyhow!("could not convert stop position: {}", e))?, + ) + .map_err(|e| anyhow::anyhow!("problem query transcript database with range: {}", e))?; + + // Extract HGNC ids / transcript ids of coding transcripts. + let mut gene_txs = txs + .into_iter() + .filter_map(|tx| { + let tx_info = self + .parent + .provider + .get_tx_info(&tx.tx_ac, &tx.alt_ac, "splign") + .expect("no tx info?"); + assert!(tx_info.hgnc.starts_with("HGNC:")); + if tx_info.cds_start_i.is_some() && tx_info.cds_end_i.is_some() { + Some((tx_info.hgnc, tx_info.tx_ac)) + } else { + None + } + }) + .collect::>(); + gene_txs.sort(); + + // Group by and collect for each gene HGNC ID. + let gene_ovls = gene_txs + .into_iter() + .group_by(|(hgnc, _)| hgnc.clone()) + .into_iter() + .map(|(hgnc, group)| { + let txs = group.map(|(_, tx)| tx).collect::>(); + let gene = self + .parent + .gene_id_data + .by_hgnc_id(&hgnc) + .expect("could not resolve HGNC ID") + .clone(); + GeneOverlap::new(gene, txs) + }) + .collect::>(); + + tracing::trace!( + "found gene overlaps for strucvar {:?}: {:?}", + &strucvar, + &gene_ovls + ); + + Ok(gene_ovls) + } +} diff --git a/src/strucvars/eval/del/section2.rs b/src/strucvars/eval/del/section2.rs new file mode 100644 index 0000000..bc89b21 --- /dev/null +++ b/src/strucvars/eval/del/section2.rs @@ -0,0 +1,1061 @@ +//! Implementation of evaluation of copy number loss section 2. + +use annonars::{clinvar_minimal::pbs::ClinicalSignificance, clinvar_minimal::pbs::ReviewStatus}; +use bio::bio_types::genome::AbstractInterval; +use hgvs::data::interface::{Provider, TxExonsRecord, TxInfoRecord}; + +use super::result::{GeneHiPrediction, GeneHiPredictorResult, Section, L2H}; +use crate::strucvars::{ + data::{ + clingen_dosage::{Gene, Region, Score}, + intervals::{contains, do_overlap, exon_to_interval, Interval}, + }, + ds::StructuralVariant, + eval::del::result::{Pvs1Result, L2, L2A, L2B, L2C1, L2C2, L2D1, L2D2, L2D4, L2E, L2F, L2G}, + eval::{common::SuggestedScore as _, del::result::L2D3}, +}; + +/// Evaluation of deletions, loss of copy number. +/// +/// This is mainly used to encapsulate the functionality. Creating new such +/// objects is very straightforward and cheap. +pub struct Evaluator<'a> { + /// The parent evaluator. + parent: &'a super::super::Evaluator, +} + +impl<'a> Evaluator<'a> { + /// Create a new `Evaluator`. + pub fn with_parent(parent: &'a super::super::Evaluator) -> Self { + Self { parent } + } + + /// Perform the evaluation of copy number loss Section 2 and all subsection. + /// + /// # Arguments + /// + /// * `strucvar` - Structural variant to be evaluated. + /// + /// # Returns + /// + /// The evaluation result. + /// + /// # Errors + /// + /// If anything goes wrong, it returns a generic `anyhow::Error`. + pub fn evaluate(&self, strucvar: &StructuralVariant) -> Result, anyhow::Error> { + tracing::debug!("evaluating section 2 for {:?}", strucvar); + let sv_interval: Interval = strucvar.clone().into(); + + // Obtain any overlapping ClinGen region and gene dosage info. + let (clingen_genes, clingen_regions) = self.clingen_overlaps(&sv_interval); + + // Check each overlapping ClinGen region/gene. If `sv_interval` completely contains + // a region/gene and the region/gene has a "sufficient evidence" score for HI then we + // have case 2A. + if let Some(result) = Self::handle_case_2a(&clingen_genes, &clingen_regions, &sv_interval) { + tracing::debug!("case 2A fired: {:?}", &result); + return Ok(result); + } + // Otherwise, we need to test case 2B. + let has_hi_genes = clingen_genes + .iter() + .any(|clingen_gene| clingen_gene.haploinsufficiency_score == Score::SufficientEvidence); + let mut result = if has_hi_genes { + let result = vec![Section::L2(L2::L2B(L2B::default()))]; + tracing::debug!("case 2B fired: {:?}", &result); + result + } else { + tracing::debug!("negative for case 2B"); + Vec::new() + }; + + // Handle cases 2C-2E. + if let Some(result_inner) = self.handle_cases_2c_2e(&sv_interval, &clingen_genes)? { + result.push(result_inner.clone()); + if !matches!(result_inner, Section::L2(L2::L2D1(L2D1 { .. }))) { + // Return in all but case 2D-1 (2D-1 = only 3' UTR is involved) + return Ok(result); + } + } + + // Check each overlapping ClinGen region for case 2F: if `sv_interval` is + // completely contained in this region and the region has a "dosage sensitivity + // unlikely" score. + let benign_regions = clingen_regions + .iter() + .filter(|clingen_region| { + clingen_region.haploinsufficiency_score == Score::DosageSensitivityUnlikely + }) + .cloned() + .collect::>(); + if let Some(value) = Self::handle_case_2f(&benign_regions, &sv_interval, &result) { + return Ok(value); + } + // If there is any overlap with a benign region then case 2G is true. + let result = { + let mut result = result; + if !benign_regions.is_empty() { + // Case 2G positive. + tracing::debug!("case 2G positive"); + result.push(Section::L2(L2::L2G(L2G { + suggested_score: 0.0, + benign_regions, + }))); + } else { + // Case 2G negative. + tracing::debug!("case 2G negative"); + }; + result + }; + + // Handle Case 2H: Two or more HI predictors suggest AT LEAST ONE gene in the interval is HI. + let result = if let Some(result_2h) = self.handle_case_2h(&sv_interval)? { + let mut result = result; + result.push(result_2h); + tracing::debug!("case 2G+2H fired: {:?}", &result); + result + } else { + tracing::debug!("case 2G fired: {:?}", &result); + result + }; + Ok(result) + } + + /// Query ClinGen by overlap. + fn clingen_overlaps( + &self, + sv_interval: &bio::bio_types::genome::Interval, + ) -> (Vec, Vec) { + let clingen_genes = self.parent.clingen_dosage_data.gene_by_overlap(sv_interval); + let clingen_regions = self + .parent + .clingen_dosage_data + .region_by_overlap(sv_interval); + tracing::debug!( + "overlaps with {} ClinGen regions and {} genes", + clingen_regions.len(), + clingen_genes.len() + ); + (clingen_genes, clingen_regions) + } + + /// Handle case 2A. + fn handle_case_2a( + clingen_genes: &[Gene], + clingen_regions: &[Region], + sv_interval: &bio::bio_types::genome::Interval, + ) -> Option> { + let hi_genes = clingen_genes + .iter() + .filter(|clingen_gene| { + let clingen_interval: Interval = (*clingen_gene) + .clone() + .try_into() + .expect("no interval for gene"); + contains(sv_interval, &clingen_interval) + && clingen_gene.haploinsufficiency_score == Score::SufficientEvidence + }) + .cloned() + .collect::>(); + let hi_regions = clingen_regions + .iter() + .cloned() + .filter(|clingen_region| { + let clingen_interval: Interval = clingen_region + .clone() + .try_into() + .expect("no interval for region"); + contains(sv_interval, &clingen_interval) + && clingen_region.haploinsufficiency_score == Score::SufficientEvidence + }) + .collect::>(); + if !hi_regions.is_empty() || !hi_genes.is_empty() { + Some(vec![Section::L2(L2::L2A(L2A { + suggested_score: 1.0, + hi_genes, + hi_regions, + }))]) + } else { + None + } + } + + // Handle cases 2C-2E. + fn handle_cases_2c_2e( + &self, + sv_interval: &bio::bio_types::genome::Interval, + clingen_genes: &[Gene], + ) -> Result, anyhow::Error> { + // Get HGNC ids of HI genes. + let hi_hgnc_ids = clingen_genes + .iter() + .filter_map(|clingen_gene| { + if clingen_gene.haploinsufficiency_score == Score::SufficientEvidence { + self.parent + .gene_id_data + .by_ncbi_gene_id(&clingen_gene.ncbi_gene_id) + .map(|gene_id_info| gene_id_info.hgnc_id.clone()) + } else { + None + } + }) + .collect::>(); + // Obtain all transcripts of established HI genes. + if let Some(alt_acc) = self.parent.chrom_to_ac.get(sv_interval.contig()) { + let tx_ids = self + .parent + .provider + .get_tx_for_region( + alt_acc, + "splign", + sv_interval.range().start as i32, + sv_interval.range().end as i32, + ) + .map_err(|e| anyhow::anyhow!("failed to get transcripts for region: {}", e))? + .into_iter() + .filter_map(|tx| { + // filter to the transcripts of the HI genes + self.parent.provider.get_tx(&tx.tx_ac).and_then(|tx| { + if hi_hgnc_ids.contains(&tx.gene_id) { + Some(tx.id) + } else { + None + } + }) + }) + .collect::>(); + let mut sections = tx_ids + .into_iter() + .filter_map(|tx_id| { + let mut exons = self + .parent + .provider + .get_tx_exons(&tx_id, alt_acc, "splign") + .expect("no exons for transcript?"); + // Ensure exons are sorted by reference start position. + exons.sort_by_key(|exon| exon.alt_start_i); + let tx_info = self + .parent + .provider + .get_tx_info(&tx_id, alt_acc, "splign") + .expect("no tx info for transcript?"); + // Now, test cases 2C..2E. + if let Some(section) = self.handle_case_2c(&tx_info, &exons, sv_interval) { + Some(section) + } else if let Some(section) = self + .handle_case_2d(&tx_info, &exons, sv_interval) + .ok() + .flatten() + { + Some(section) + } else { + self.handle_case_2e(&tx_info, sv_interval) + } + }) + .collect::>(); + + // Sort by suggested score. + sections.sort_by(|a, b| { + let a = a.suggested_score(); + let b = b.suggested_score(); + a.partial_cmp(&b).unwrap_or(std::cmp::Ordering::Equal) + }); + + Ok(sections.pop()) + } else { + Ok(None) + } + } + + /// Handle case 2C: 5' end overlap. + fn handle_case_2c( + &self, + tx_info: &TxInfoRecord, + exons: &[TxExonsRecord], + sv_interval: &bio::bio_types::genome::Interval, + ) -> Option

{ + let most_lhs = exons + .iter() + .map(|exon| exon.alt_start_i - 1) + .min() + .expect("no exons?"); + let most_lhs = + bio::bio_types::genome::Locus::new(sv_interval.contig().to_string(), most_lhs as u64); + let most_rhs = exons + .iter() + .map(|exon| exon.alt_end_i) + .max() + .expect("no exons?"); + let most_rhs = + bio::bio_types::genome::Locus::new(sv_interval.contig().to_string(), most_rhs as u64); + let is_forward = exons.first().expect("no exons?").alt_strand >= 0; + + if is_forward && !sv_interval.contains(most_lhs.clone()) // forward: 5' is left + // reverse: 5' is right + || !is_forward && !sv_interval.contains(most_rhs.clone()) + { + tracing::debug!( + "case 2C negative: {}, {:?}, lhs={:?}, rhs={:?}", + is_forward, + &sv_interval, + &most_lhs, + &most_rhs + ); + // is not case 2C positive + return None; + } + // Case 2C positive. + tracing::debug!("case 2C fired: {:?} contains {:?}", &sv_interval, &most_lhs); + assert!( + (!is_forward || !sv_interval.contains(most_rhs)) + && (is_forward | !sv_interval.contains(most_lhs)) + ); + if let (Some(cds_start_i), Some(cds_end_i)) = (tx_info.cds_start_i, tx_info.cds_end_i) { + let cds_interval = Interval::new( + sv_interval.contig().to_string(), + (cds_start_i as u64)..(cds_end_i as u64), + ); + let gene = self + .parent + .gene_id_data + .by_hgnc_id(&tx_info.hgnc) + .expect("no gene id info?") + .clone(); + if do_overlap(sv_interval, &cds_interval) { + // Case 2C-1 positive. + tracing::debug!("case 2C-1: {:?} vs. {:?}", sv_interval, &cds_interval); + Some(Section::L2(L2::L2C1(L2C1 { + suggested_score: 0.9, + gene, + }))) + } else { + // Case 2C-2 positive. + tracing::debug!("case 2C-2: {:?} vs. {:?}", sv_interval, &cds_interval); + Some(Section::L2(L2::L2C2(L2C2 { + suggested_score: 0.0, + gene, + }))) + } + } else { + tracing::debug!("non-coding transcript {:?}?", &tx_info.tx_ac); + None + } + } + + /// Handle case 2D: 3' end overlap. + fn handle_case_2d( + &self, + tx_info: &TxInfoRecord, + exons: &[TxExonsRecord], + sv_interval: &bio::bio_types::genome::Interval, + ) -> Result, anyhow::Error> { + let most_lhs = exons + .iter() + .map(|exon| exon.alt_start_i) + .min() + .expect("no exons?"); + let most_lhs = + bio::bio_types::genome::Locus::new(sv_interval.contig().to_string(), most_lhs as u64); + let most_rhs = exons + .iter() + .map(|exon| exon.alt_end_i - 1) + .max() + .expect("no exons?"); + let most_rhs = + bio::bio_types::genome::Locus::new(sv_interval.contig().to_string(), most_rhs as u64); + let is_forward = exons.first().expect("no exons?").alt_strand >= 0; + + if is_forward && !sv_interval.contains(most_rhs.clone()) // forward: 5' is right + || !is_forward && !sv_interval.contains(most_lhs.clone()) + { + // is not case 2D positive + return Ok(None); + } + // Case 2D positive. + tracing::debug!("case 2D fired: {:?} contains {:?}", &sv_interval, &most_rhs); + assert!( + (!is_forward || !sv_interval.contains(most_lhs)) + && (is_forward | !sv_interval.contains(most_rhs)) + ); + + if let (Some(cds_start_i), Some(cds_end_i)) = (tx_info.cds_start_i, tx_info.cds_end_i) { + // Get CDS as interval. + let cds_interval = Interval::new( + sv_interval.contig().to_string(), + (cds_start_i as u64)..(cds_end_i as u64), + ); + + let gene = self + .parent + .gene_id_data + .by_hgnc_id(&tx_info.hgnc) + .expect("no gene id info?") + .clone(); + if !do_overlap(sv_interval, &cds_interval) { + // Case 2D-1 positive: only 3' UTR is involved. + tracing::debug!("case 2D-1 fired: {:?} vs. {:?}", sv_interval, &cds_interval); + return Ok(Some(Section::L2(L2::L2D1(L2D1 { gene })))); + } + + // Get all exons that are affected and in CDS. + let affected_cds_exons = exons + .iter() + .filter(|exon| { + let exon = exon_to_interval(sv_interval.contig().to_string(), exon); + do_overlap(&exon, sv_interval) && do_overlap(&exon, &cds_interval) + }) + .collect::>(); + + assert!(!affected_cds_exons.is_empty()); + if affected_cds_exons.len() == 1 { + let clinvar_records = self + .parent + .query_clinvar_range( + sv_interval.contig(), + affected_cds_exons[0].alt_start_i as u32 + 1, + affected_cds_exons[0].alt_end_i as u32, + ) + .map_err(|e| anyhow::anyhow!("failed to query ClinVar database: {}", e))?; + // We count VCV records as established if they have two stars. + let mut established_pathogenic_vcvs = clinvar_records + .into_iter() + .filter_map(|clinvar_record| { + let clinsig = clinvar_record.reference_assertions[0].clinical_significance; + let status = clinvar_record.reference_assertions[0].review_status; + if status + <= ReviewStatus::CriteriaProvidedMultipleSubmittersNoConflicts as i32 + && clinsig <= ClinicalSignificance::LikelyPathogenic as i32 + { + Some(clinvar_record.vcv) + } else { + None + } + }) + .collect::>(); + established_pathogenic_vcvs.sort(); + if established_pathogenic_vcvs.is_empty() { + tracing::debug!("case 2D-3 fired: {:?}", affected_cds_exons); + Ok(Some(Section::L2(L2::L2D3(L2D3 { + suggested_score: 0.3, + gene, + })))) + } else { + tracing::debug!("case 2D-2 fired: {:?}", affected_cds_exons); + Ok(Some(Section::L2(L2::L2D2(L2D2 { + suggested_score: 0.9, + gene, + clinvar_ids: established_pathogenic_vcvs, + })))) + } + } else { + // Case 2D-4 positive: other exons in addition to last one involved. Nonsense- + // mediated decay is expected to occur. Note that we interpret the second + // sentence as an instruction here and not a second condition. + tracing::debug!("case 2D-4 fired: {:?}", affected_cds_exons); + Ok(Some(Section::L2(L2::L2D4(L2D4 { + suggested_score: 0.90, + gene, + exon_nos: affected_cds_exons + .iter() + .map(|exon| exon.ord as u32 + 1) + .collect::>(), + })))) + } + } else { + tracing::debug!("non-coding transcript {:?}?", &tx_info.tx_ac); + Ok(None) + } + } + + /// Handle case 2E. + fn handle_case_2e( + &self, + tx_info: &TxInfoRecord, + sv_interval: &bio::bio_types::genome::Interval, + ) -> Option
{ + let _ = tx_info; + let _ = sv_interval; + // Case 2E positive. + tracing::debug!("case 2E fired"); + tracing::warn!("PVS1 not yet implemented for case 2E TODO !!! TODO"); + + Some(Section::L2(L2::L2E(L2E { + suggested_score: 0.9, + pvs1_result: Pvs1Result::Pvs1, + }))) + } + + /// Handle case 2F: complete contained within benign region. + fn handle_case_2f( + benign_regions: &[Region], + sv_interval: &bio::bio_types::genome::Interval, + result: &[Section], + ) -> Option> { + let containing_benign_regions = benign_regions + .iter() + .filter(|clingen_region| { + let clingen_interval: Interval = (*clingen_region) + .clone() + .try_into() + .expect("no interval for region"); + contains(&clingen_interval, sv_interval) + }) + .map(|region| (*region).clone()) + .collect::>(); + if !containing_benign_regions.is_empty() { + let mut result = Vec::from_iter(result.iter().cloned()); + result.push(Section::L2(L2::L2F(L2F { + suggested_score: -1.0, + benign_regions: containing_benign_regions, + }))); + tracing::debug!("case 2F fired: {:?}", &result); + Some(result) + } else { + None + } + } + + /// Handle case 2H. + /// + /// Evaluate whether any overlapping RefSeq gene has two or more HI predictors + /// suggesting that it HI. + /// + /// We use the criterion from AutoCNV. (1) gnomAD pLI>=0.9 and upper bound of + /// observed/expected LoF confidence interval <0.35 and (2) DECIPHER HI index + /// <=10%. + fn handle_case_2h( + &self, + sv_interval: &bio::bio_types::genome::Interval, + ) -> Result, anyhow::Error> { + if let Some(alt_acc) = self.parent.chrom_to_ac.get(sv_interval.contig()) { + let txs = self + .parent + .provider + .get_tx_for_region( + alt_acc, + "splign", + sv_interval.range().start as i32, + sv_interval.range().end as i32, + ) + .map_err(|e| anyhow::anyhow!("failed to get transcripts for region: {}", e))?; + let mut hgnc_ids = txs + .iter() + .flat_map(|tx| self.parent.provider.get_tx(&tx.tx_ac).map(|tx| tx.gene_id)) + .collect::>(); + hgnc_ids.sort(); + hgnc_ids.dedup(); + + let gene_hi_predictions = hgnc_ids + .iter() + .flat_map(|hgnc_id| self.get_gene_hi_predictions(hgnc_id)) + .collect::>(); + if gene_hi_predictions.is_empty() { + Ok(None) + } else { + Ok(Some(Section::L2(L2::L2H(L2H { + suggested_score: 0.15, + gene_hi_predictions, + })))) + } + } else { + Ok(None) + } + } + + /// Obtains `GeneHiPrediction`s for the given HGNC ID. + /// + /// And checks whether the gene is predicted to be haploinsufficient by two or more + /// predictors using the criteria outlined in documentation of `Self::handle_case_2h()`. + fn get_gene_hi_predictions(&self, hgnc_id: &str) -> Option { + let hi_record = self.parent.decipher_hi_data.by_hgnc_id(hgnc_id); + let gnomad_record = self.parent.gnomad_constraint_data.by_hgnc_id(hgnc_id); + tracing::info!("HI = {:?}, gnomad = {:?}", &hi_record, &gnomad_record); + if let (Some(hi_record), Some(gnomad_record)) = (hi_record, gnomad_record) { + if let (Some(pli), Some(oe_lof_upper)) = (gnomad_record.pli, gnomad_record.oe_lof_upper) + { + if hi_record.hi_index <= 10.0 && pli >= 0.9 && oe_lof_upper < 0.35 { + Some(GeneHiPrediction { + gene: self + .parent + .gene_id_data + .by_hgnc_id(hgnc_id) + .expect("could not resolve HGNC ID") + .clone(), + results: vec![ + GeneHiPredictorResult::DecipherHiIndex { + hi_index: hi_record.hi_index, + }, + GeneHiPredictorResult::GnomadPli { + pli_score: pli, + oe_lof_upper, + }, + ], + }) + } else { + None + } + } else { + None + } + } else { + None + } + } +} + +#[cfg(test)] +mod test { + use hgvs::data::interface::Provider; + + use crate::strucvars::ds; + + use super::super::super::test::global_evaluator_37; + + /// Test the `evaluate` function with the different cases of Section 2. + /// + /// We make one test with `COL3A1` gene on forward strand and `APOB` + /// on the reverse strand. We use `COL16A1` as a gene that is unlikely HI + /// and `REV3L` as one that is only predicted as HI. `AVCRL2` has pathogenic + /// variants in the last coding exon. + /// + /// We only make checks for `APOB` for cases 2C..2E where the strand is + /// considered and important for UTR/CDS distinction. + #[tracing_test::traced_test] + #[rstest::rstest] + // Case 2A: with region ISCA-46303 + #[case("17", 67_892_996, 69_792_434, "ISCA-46303", "2A-pos")] // full + #[case("17", 67_892_996, 69_792_433, "ISCA-46303", "2A-neg-1")] // partial left + #[case("17", 67_892_997, 69_792_434, "ISCA-46303", "2A-neg-2")] // partial right + #[case("17", 67_892_997, 69_792_433, "ISCA-46303", "2A-neg-3")] + // contained + // Case 2A: with gene COL3A1 (negative cases have no overlap so other cases don't trigger) + #[case("2", 189_839_099, 189_877_472, "COL3A1", "2A-pos")] // full + #[case("2", 189_877_473, 189_878_473, "COL3A1", "2A-neg-1")] // left + #[case("2", 189_838_099, 189_839_098, "COL3A1", "2A-neg-2")] // right + // Case 2B negative (positive is handled as part of 2C..2E) + #[case("1", 32_117_848, 32_169_768, "COL16A1", "2B-neg")] // full overlap + // Case 2C-1: 5' end overlap of HI gene, with coding sequence + #[case("2", 189_839_029, 189_839_743, "COL3A1", "2C-1")] + #[case("2", 21_266_811, 21_266_952, "APOB", "2C-1")] + // Case 2C-2: 5' end overlap of HI gene, only 5' UTR + #[case("2", 189_839_029, 189_839_210, "COL3A1", "2C-2")] + #[case("2", 21_266_820, 21_266_952, "APOB", "2C-2")] + // Case 2D-1: 3' end overlap of HI gene, only 3' UTR + #[case("2", 189_876_511, 189_877_563, "COL3A1", "2D-1")] + #[case("2", 21_224_301, 21_224_585, "APOB", "2D-1")] + // Case 2D-2: 3' end overlap of HI gene, last exon only, pathogenic vars + #[case("12", 52_314_363, 52_317_600, "AVCRL2", "2D-2")] + // Case 2D-3: 3' end overlap of HI gene, last exon only, NO pathogenic vars + #[case("2", 189_875_865, 189_877_563, "COL3A1", "2D-3")] + #[case("2", 21_224_301, 21_226_760, "APOB", "2D-3")] + // Case 2D-4: 3' end overlap of HI gene, more than last exon + #[case("2", 189_875_551, 189_877_563, "COL3A1", "2D-4")] + #[case("2", 21_224_301, 21_227_294, "APOB", "2D-4")] + // Case 2E: intragenic variant, both breakpoints in same gene + // TODO: need more tests once VPS1 has been implemented + #[case("2", 189_839_528, 189_876_186, "COL3A1", "2E")] + #[case("2", 21_226_513, 21_266_591, "APOB", "2E")] + // Case 2F: completely contained in benign region + #[case("1", 12_989_199, 12_998_900, "ISCA-46311", "2F-pos")] // contained + #[case("1", 12_998_901, 12_998_901, "ISCA-46311", "2F-neg-1")] // right of + #[case("1", 12_989_198, 12_989_198, "ISCA-46311", "2F-neg-2")] // left of + // Case 2G: overlapping benign region, but additional material + #[case("1", 12_989_199, 12_998_901, "ISCA-46311", "2G-pos-1")] // additional right + #[case("1", 12_989_198, 12_998_900, "ISCA-46311", "2G-pos-2")] + // additional left + // Case 2H: two or more HI predictors suggest AT LEAST ONE gene in the interval is HI + #[case("1", 12_040_238, 12_073_572, "MFN2", "2H-neg")] // MFN2 not DECIPHER HI + #[case("6", 111_620_236, 111_804_914, "REV3L", "2H-pos-1")] // REV3L, contained + #[case("6", 111_620_236, 111_620_236, "REV3L", "2H-pos-2")] // REV3L, 1bp left + #[case("6", 111_804_914, 111_804_914, "REV3L", "2H-pos-3")] // REV3L, 1bp right + #[case("6", 111_620_235, 111_620_235, "REV3L", "2H-neg-1")] // REV3L, left of + #[case("6", 111_804_915, 111_804_915, "REV3L", "2H-neg-2")] // REV3L, right of + fn evaluate( + #[case] chrom: &str, + #[case] start: u64, + #[case] stop: u64, + #[case] gene: &str, + #[case] label: &str, + global_evaluator_37: super::super::super::Evaluator, + ) -> Result<(), anyhow::Error> { + mehari::common::set_snapshot_suffix!("{}-{}", gene, label); + + let evaluator = super::Evaluator::with_parent(&global_evaluator_37); + let strucvar = ds::StructuralVariant { + chrom: chrom.into(), + start: start as u32, + stop: stop as u32, + svtype: ds::SvType::Del, + ambiguous_range: None, + }; + + let res = evaluator.evaluate(&strucvar)?; + insta::assert_yaml_snapshot!(res); + + Ok(()) + } + + /// Test internal working of `clingen_overlaps`. + #[tracing_test::traced_test] + #[rstest::rstest] + #[case("17", 67_892_000, 69_793_000, "region-match")] + #[case("X", 152_990_000, 153_011_000, "gene-abcd1")] + fn clingen_overlaps( + #[case] chrom: &str, + #[case] start: u64, + #[case] stop: u64, + #[case] label: &str, + global_evaluator_37: super::super::super::Evaluator, + ) -> Result<(), anyhow::Error> { + mehari::common::set_snapshot_suffix!("{}", label); + + let sv_interval = super::Interval::new(chrom.into(), start..stop); + let evaluator = super::Evaluator::with_parent(&global_evaluator_37); + + let (genes, regions) = evaluator.clingen_overlaps(&sv_interval); + + insta::assert_yaml_snapshot!(genes); + insta::assert_yaml_snapshot!(regions); + + Ok(()) + } + + /// Test inernal working of `handle_case_2a` (complete overlap). + #[tracing_test::traced_test] + #[rstest::rstest] + // Note: same cases as in `evaluate` above -- keep in sync! + #[case("17", 67_892_996, 69_792_434, "ISCA-46303", "2A-pos")] + #[case("2", 189_839_099, 189_877_472, "COL3A1", "2A-pos")] + fn handle_case_2a( + #[case] chrom: &str, + #[case] start: u64, + #[case] stop: u64, + #[case] hgnc_id: &str, + #[case] label: &str, + global_evaluator_37: super::super::super::Evaluator, + ) -> Result<(), anyhow::Error> { + mehari::common::set_snapshot_suffix!("{}-{}", hgnc_id, label); + + let strucvar = ds::StructuralVariant { + chrom: chrom.into(), + start: start as u32, + stop: stop as u32, + svtype: ds::SvType::Del, + ambiguous_range: None, + }; + let sv_interval = strucvar.into(); + let genes = global_evaluator_37 + .clingen_dosage_data + .gene_by_overlap(&sv_interval); + let regions = global_evaluator_37 + .clingen_dosage_data + .region_by_overlap(&sv_interval); + + let res = super::Evaluator::handle_case_2a(&genes, ®ions, &sv_interval); + insta::assert_yaml_snapshot!(res); + + Ok(()) + } + + /// Test internal workings of `handle_cases_2c_2e` (5', 3' overlap, intragenic). + #[tracing_test::traced_test] + #[rstest::rstest] + // Note: same cases as in `evaluate` above -- keep in sync! + // Case 2C-1: 5' end overlap of HI gene, with coding sequence + #[case("2", 189_839_029, 189_839_743, "COL3A1", "2C-1")] + #[case("2", 21_266_811, 21_266_952, "APOB", "2C-1")] + // Case 2C-2: 5' end overlap of HI gene, only 5' UTR + #[case("2", 189_839_029, 189_839_210, "COL3A1", "2C-2")] + #[case("2", 21_266_820, 21_266_952, "APOB", "2C-2")] + // Case 2D-1: 3' end overlap of HI gene, only 3' UTR + #[case("2", 189_876_511, 189_877_563, "COL3A1", "2D-1")] + #[case("2", 21_224_301, 21_224_585, "APOB", "2D-1")] + // Case 2D-2: 3' end overlap of HI gene, last exon only, pathogenic vars + #[case("12", 52_314_363, 52_317_600, "AVCRL2", "2D-2")] + // Case 2D-3: 3' end overlap of HI gene, last exon only, NO pathogenic vars + #[case("2", 189_875_865, 189_877_563, "COL3A1", "2D-3")] + #[case("2", 21_224_301, 21_226_760, "APOB", "2D-3")] + // Case 2D-4: 3' end overlap of HI gene, more than last exon + #[case("2", 189_875_551, 189_877_563, "COL3A1", "2D-4")] + #[case("2", 21_224_301, 21_227_294, "APOB", "2D-4")] + // Case 2E: intragenic variant, both breakpoints in same gene + // TODO: need more tests once VPS1 has been implemented + #[case("2", 189_839_528, 189_876_186, "COL3A1", "2E")] + #[case("2", 21_226_513, 21_266_591, "APOB", "2E")] + fn handle_cases_2c_2e( + #[case] chrom: &str, + #[case] start: u64, + #[case] stop: u64, + #[case] hgnc_id: &str, + #[case] label: &str, + global_evaluator_37: super::super::super::Evaluator, + ) -> Result<(), anyhow::Error> { + mehari::common::set_snapshot_suffix!("{}-{}", hgnc_id, label); + + let strucvar = ds::StructuralVariant { + chrom: chrom.into(), + start: start as u32, + stop: stop as u32, + svtype: ds::SvType::Del, + ambiguous_range: None, + }; + let sv_interval = strucvar.into(); + let evaluator = super::Evaluator::with_parent(&global_evaluator_37); + let (genes, _) = evaluator.clingen_overlaps(&sv_interval); + + let res = evaluator.handle_cases_2c_2e(&sv_interval, &genes)?; + insta::assert_yaml_snapshot!(res); + + Ok(()) + } + + /// Test internal workings of `handle_case_2c` (5' end overlap from upstream). + #[tracing_test::traced_test] + #[rstest::rstest] + // Note: same cases as in `evaluate` above -- keep in sync! + // Case 2C-1: 5' end overlap of HI gene, with coding sequence + #[case("2", 189_839_029, 189_839_743, "COL3A1", "2C-1")] + #[case("2", 21_266_811, 21_266_952, "APOB", "2C-1")] + // Case 2C-2: 5' end overlap of HI gene, only 5' UTR + #[case("2", 189_839_029, 189_839_210, "COL3A1", "2C-2")] + #[case("2", 21_266_820, 21_266_952, "APOB", "2C-2")] + fn handle_case_2c( + #[case] chrom: &str, + #[case] start: u64, + #[case] stop: u64, + #[case] hgnc_id: &str, + #[case] label: &str, + global_evaluator_37: super::super::super::Evaluator, + ) -> Result<(), anyhow::Error> { + mehari::common::set_snapshot_suffix!("{}-{}", hgnc_id, label); + + let strucvar = ds::StructuralVariant { + chrom: chrom.into(), + start: start as u32, + stop: stop as u32, + svtype: ds::SvType::Del, + ambiguous_range: None, + }; + let sv_interval = strucvar.into(); + + let tx_id = if hgnc_id == "COL3A1" { + "NM_000090.4" + } else { + "NM_000384.3" + }; + + let tx_info = global_evaluator_37 + .provider + .get_tx_info(tx_id, "NC_000002.11", "splign")?; + let exons = global_evaluator_37 + .provider + .get_tx_exons(tx_id, "NC_000002.11", "splign")?; + let evaluator = super::Evaluator::with_parent(&global_evaluator_37); + + let res = evaluator.handle_case_2c(&tx_info, &exons, &sv_interval); + insta::assert_yaml_snapshot!(res); + + Ok(()) + } + + /// Test internal workings of `handle_case_2d` (3' end overlap from downstream). + #[tracing_test::traced_test] + #[rstest::rstest] + // Note: same cases as in `evaluate` above -- keep in sync! + // Case 2D-1: 3' end overlap of HI gene, only 3' UTR + #[case("2", 189_876_511, 189_877_563, "COL3A1", "2D-1")] + #[case("2", 21_224_301, 21_224_585, "APOB", "2D-1")] + // Case 2D-2: 3' end overlap of HI gene, last exon only, pathogenic vars + // TODO: need to find example with pathogenic variants + // Case 2D-3: 3' end overlap of HI gene, last exon only, NO pathogenic vars + #[case("2", 189_875_865, 189_877_563, "COL3A1", "2D-3")] + #[case("2", 21_224_301, 21_226_760, "APOB", "2D-3")] + // Case 2D-4: 3' end overlap of HI gene, more than last exon + #[case("2", 189_875_551, 189_877_563, "COL3A1", "2D-4")] + #[case("2", 21_224_301, 21_227_294, "APOB", "2D-4")] + fn handle_case_2d( + #[case] chrom: &str, + #[case] start: u64, + #[case] stop: u64, + #[case] hgnc_id: &str, + #[case] label: &str, + global_evaluator_37: super::super::super::Evaluator, + ) -> Result<(), anyhow::Error> { + mehari::common::set_snapshot_suffix!("{}", label); + + let strucvar = ds::StructuralVariant { + chrom: chrom.into(), + start: start as u32, + stop: stop as u32, + svtype: ds::SvType::Del, + ambiguous_range: None, + }; + let sv_interval = strucvar.into(); + + let tx_id = if hgnc_id == "COL3A1" { + "NM_000090.4" + } else { + "NM_000384.3" + }; + + let tx_info = global_evaluator_37 + .provider + .get_tx_info(tx_id, "NC_000002.11", "splign")?; + let exons = global_evaluator_37 + .provider + .get_tx_exons(tx_id, "NC_000002.11", "splign")?; + let evaluator = super::Evaluator::with_parent(&global_evaluator_37); + + let res = evaluator.handle_case_2d(&tx_info, &exons, &sv_interval)?; + + insta::assert_yaml_snapshot!(res); + + Ok(()) + } + + /// Test internal workings of `handle_case_2e` (2E positive case). + #[tracing_test::traced_test] + #[rstest::rstest] + // Note: same cases as in `evaluate` above -- keep in sync! + // Case 2E: intragenic variant, both breakpoints in same gene + // TODO: need more tests once VPS1 has been implemented + #[case("2", 189_839_528, 189_876_186, "COL3A1", "2E")] + #[case("2", 21_226_513, 21_266_591, "APOB", "2E")] + fn handle_case_2e( + #[case] chrom: &str, + #[case] start: u64, + #[case] stop: u64, + #[case] hgnc_id: &str, + #[case] label: &str, + global_evaluator_37: super::super::super::Evaluator, + ) -> Result<(), anyhow::Error> { + mehari::common::set_snapshot_suffix!("{}-{}", hgnc_id, label); + + let strucvar = ds::StructuralVariant { + chrom: chrom.into(), + start: start as u32, + stop: stop as u32, + svtype: ds::SvType::Del, + ambiguous_range: None, + }; + let sv_interval = strucvar.into(); + + let tx_id = if hgnc_id == "COL3A1" { + "NM_000090.4" + } else { + "NM_000384.3" + }; + + let tx_info = global_evaluator_37 + .provider + .get_tx_info(tx_id, "NC_000002.11", "splign")?; + let evaluator = super::Evaluator::with_parent(&global_evaluator_37); + + let section = evaluator.handle_case_2e(&tx_info, &sv_interval); + insta::assert_yaml_snapshot!(section); + + Ok(()) + } + + /// Test internal workings of `handle_case_2f` + #[tracing_test::traced_test] + #[rstest::rstest] + // Note: same cases as in `evaluate` above -- keep in sync! + // Case 2F: completely contained in benign region + #[case("1", 12_989_199, 12_998_900, "ISCA-46311", "2F-pos")] // contained + #[case("1", 12_998_901, 12_998_901, "ISCA-46311", "2F-neg-1")] // right of + #[case("1", 12_989_198, 12_989_198, "ISCA-46311", "2F-neg-2")] // left of + fn handle_case_2f( + #[case] chrom: &str, + #[case] start: u64, + #[case] stop: u64, + #[case] hgnc_id: &str, + #[case] label: &str, + global_evaluator_37: super::super::super::Evaluator, + ) -> Result<(), anyhow::Error> { + mehari::common::set_snapshot_suffix!("{}-{}", hgnc_id, label); + + let strucvar = ds::StructuralVariant { + chrom: chrom.into(), + start: start as u32, + stop: stop as u32, + svtype: ds::SvType::Del, + ambiguous_range: None, + }; + let sv_interval = strucvar.into(); + + let evaluator = super::Evaluator::with_parent(&global_evaluator_37); + let (gene_overlaps, region_overlaps) = evaluator.clingen_overlaps(&sv_interval); + + insta::assert_yaml_snapshot!(&gene_overlaps); + insta::assert_yaml_snapshot!(®ion_overlaps); + + let res = super::Evaluator::handle_case_2f(®ion_overlaps, &sv_interval, &[]); + assert_eq!(res.is_some(), label.starts_with("2F-pos")); + insta::assert_yaml_snapshot!(res); + + Ok(()) + } + + /// Test internal workings of `handle_case_2h`. + #[tracing_test::traced_test] + #[rstest::rstest] + // Note: same cases as in `evaluate` above -- keep in sync! + // Case 2H: two or more HI predictors suggest AT LEAST ONE gene in the interval is HI + #[case("1", 12_040_238, 12_073_572, "MFN2", "2H-neg")] // MFN2 not DECIPHER HI + #[case("6", 111_620_236, 111_804_914, "REV3L", "2H-pos-1")] // REV3L, contained + #[case("6", 111_620_236, 111_620_236, "REV3L", "2H-pos-2")] // REV3L, 1bp left + #[case("6", 111_804_914, 111_804_914, "REV3L", "2H-pos-3")] // REV3L, 1bp right + #[case("6", 111_620_235, 111_620_235, "REV3L", "2H-neg-1")] // REV3L, left of + #[case("6", 111_804_915, 111_804_915, "REV3L", "2H-neg-2")] // REV3L, right of + fn handle_case_2h( + #[case] chrom: &str, + #[case] start: u64, + #[case] stop: u64, + #[case] hgnc_id: &str, + #[case] label: &str, + global_evaluator_37: super::super::super::Evaluator, + ) -> Result<(), anyhow::Error> { + mehari::common::set_snapshot_suffix!("{}-{}", hgnc_id, label); + + let strucvar = ds::StructuralVariant { + chrom: chrom.into(), + start: start as u32, + stop: stop as u32, + svtype: ds::SvType::Del, + ambiguous_range: None, + }; + let sv_interval = strucvar.into(); + + let evaluator = super::Evaluator::with_parent(&global_evaluator_37); + + let res = evaluator.handle_case_2h(&sv_interval)?; + insta::assert_yaml_snapshot!(res); + + Ok(()) + } + + /// Test internal workings of `get_gene_hi_predictions()`.` + #[tracing_test::traced_test] + #[rstest::rstest] + fn get_gene_hi_predictions( + global_evaluator_37: super::super::super::Evaluator, + ) -> Result<(), anyhow::Error> { + let evaluator = super::Evaluator::with_parent(&global_evaluator_37); + + // MFN2 is not in DECIPHER HI. + let res_mfn2 = evaluator.get_gene_hi_predictions("HGNC:16877"); + assert!(res_mfn2.is_none()); + + // REV3L is in DECIPHER HI. + let res_rev3l = evaluator.get_gene_hi_predictions("HGNC:9968"); + insta::assert_yaml_snapshot!(res_rev3l); + + // RERE is in DECIPHER HI. + let res_rere = evaluator.get_gene_hi_predictions("HGNC:9965"); + insta::assert_yaml_snapshot!(res_rere); + + Ok(()) + } +} diff --git a/src/strucvars/eval/del/section3.rs b/src/strucvars/eval/del/section3.rs new file mode 100644 index 0000000..12476fa --- /dev/null +++ b/src/strucvars/eval/del/section3.rs @@ -0,0 +1,132 @@ +//! Implementation of evaluation of copy number loss section 3. + +use super::result::Section; +use crate::strucvars::eval::del::result::{L3Count, L1, L1A, L3}; + +/// Evaluation of deletions, loss of copy number. +/// +/// This is mainly used to encapsulate the functionality. Creating new such +/// objects is very straightforward and cheap. +#[derive(Default)] +pub struct Evaluator {} + +impl Evaluator { + /// Create a new `Evaluator`. + pub fn new() -> Self { + Self {} + } + + /// Perform the evaluation of copy number loss Section 3 and all subsection. + /// + /// # Arguments + /// + /// * `l1` - Result of section 1. + /// + /// # Returns + /// + /// The evaluation result. + /// + /// # Errors + /// + /// If anything goes wrong, it returns a generic `anyhow::Error`. + pub fn evaluate(&self, l1: &Section) -> Result { + tracing::debug!("re-evaluating Section 1 results for Section 3 for {:?}", l1); + let result = match l1 { + Section::L1(L1::L1A(L1A { genes })) => { + if genes.len() < 25 { + Section::L3(L3::L3A(L3Count { + suggested_score: 0.0, + num_genes: genes.len(), + genes: genes.clone(), + })) + } else if genes.len() < 35 { + Section::L3(L3::L3A(L3Count { + suggested_score: 0.45, + num_genes: genes.len(), + genes: genes.clone(), + })) + } else { + Section::L3(L3::L3A(L3Count { + suggested_score: 0.9, + num_genes: genes.len(), + genes: genes.clone(), + })) + } + } + Section::L1(L1::L1B(_)) => Section::L3(L3::L3A(L3Count { + suggested_score: 0.0, + num_genes: 0, + genes: Vec::new(), + })), + _ => anyhow::bail!("unexpected Section 1 result: {:?}", l1), + }; + + tracing::debug!("result = {:?}", &result); + + Ok(result) + } +} + +#[cfg(test)] +mod test { + use crate::strucvars::data::hgnc::GeneIdInfo; + use crate::strucvars::eval::del::result::{GeneOverlap, L3Count, Section, L1, L1A, L1B, L3}; + + // #[tracing_test::traced_test] + #[rstest::rstest] + #[case(0, 0.0)] + #[case(1, 0.0)] + #[case(24, 0.0)] + #[case(25, 0.45)] + #[case(34, 0.45)] + #[case(35, 0.9)] + fn test_evaluate_l1a( + #[case] n_genes: usize, + #[case] expected_score: f32, + ) -> Result<(), anyhow::Error> { + mehari::common::set_snapshot_suffix!("{}", n_genes); + + let genes: Vec = (0..n_genes) + .map(|_| GeneOverlap { + gene: GeneIdInfo::default(), + tx_ids: vec![String::from("fake-tx-id")], + }) + .collect(); + + let evaluator = super::Evaluator::new(); + + let result = evaluator.evaluate(&Section::L1(L1::L1A(L1A { genes })))?; + match result { + Section::L3(L3::L3A(L3Count { + suggested_score, + num_genes, + .. + })) + | Section::L3(L3::L3B(L3Count { + suggested_score, + num_genes, + .. + })) + | Section::L3(L3::L3C(L3Count { + suggested_score, + num_genes, + .. + })) => { + assert_eq!(expected_score, suggested_score); + assert_eq!(n_genes, num_genes); + } + _ => panic!("invalid section {:?}", &result), + } + + Ok(()) + } + + // #[tracing_test::traced_test] + #[rstest::rstest] + fn test_evaluate_l1b() -> Result<(), anyhow::Error> { + let evaluator = super::Evaluator::new(); + insta::assert_yaml_snapshot!(&evaluator.evaluate(&Section::L1(L1::L1B(L1B {})))?); + + Ok(()) + } +} diff --git a/src/strucvars/eval/del/section4.rs b/src/strucvars/eval/del/section4.rs new file mode 100644 index 0000000..1ee1f96 --- /dev/null +++ b/src/strucvars/eval/del/section4.rs @@ -0,0 +1,67 @@ +//! Implementation of evaluation of copy number loss section 4. +//! +//! Note that we can only determine 4O reliably in an automated fashion. We support +//! reporting overlapping variants from ClinVar but leave them as "dangle" because a +//! human must evaluate the phenotype. + +use super::result::Section; +use crate::strucvars::ds::StructuralVariant; + +/// Evaluation of deletions, loss of copy number. +/// +/// This is mainly used to encapsulate the functionality. Creating new such +/// objects is very straightforward and cheap. +pub struct Evaluator<'a> { + /// The parent evaluator. + #[allow(dead_code)] // TODO + parent: &'a super::super::Evaluator, +} + +impl<'a> Evaluator<'a> { + /// Create a new `Evaluator`. + pub fn with_parent(parent: &'a super::super::Evaluator) -> Self { + Self { parent } + } + + /// Perform the evaluation of copy number loss Section 3 and all subsection. + /// + /// # Arguments + /// + /// * `strucvar` - Structural variant to be evaluated. + /// + /// # Returns + /// + /// The evaluation result. + /// + /// # Errors + /// + /// If anything goes wrong, it returns a generic `anyhow::Error`. + pub fn evaluate(&self, strucvar: &StructuralVariant) -> Result, anyhow::Error> { + let _ = strucvar; + tracing::warn!("Section 4 evaluation not implemented yet"); + Ok(Vec::new()) + } +} + +#[cfg(test)] +mod test { + use crate::strucvars::ds::StructuralVariant; + + use super::super::super::test::global_evaluator_37; + + // #[tracing_test::traced_test] + #[rstest::rstest] + fn test_evaluate( + global_evaluator_37: super::super::super::Evaluator, + ) -> Result<(), anyhow::Error> { + let evaluator = super::Evaluator::with_parent(&global_evaluator_37); + + // TODO: write test after we have an actual implementation + assert_eq!( + evaluator.evaluate(&StructuralVariant::default())?, + Vec::new() + ); + + Ok(()) + } +} diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@gene-abcd1-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@gene-abcd1-2.snap new file mode 100644 index 0000000..1bdff9e --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@gene-abcd1-2.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: regions +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@gene-abcd1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@gene-abcd1.snap new file mode 100644 index 0000000..d0b0505 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@gene-abcd1.snap @@ -0,0 +1,19 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: genes +--- +- gene_symbol: BCAP31 + ncbi_gene_id: "10134" + genomic_location: "X:152965947-152990201" + haploinsufficiency_score: SufficientEvidence + triplosensitivity_score: NoEvidenceAvailable + haploinsufficiency_disease_id: "MONDO:0010334" + triplosensitivity_disease_id: ~ +- gene_symbol: ABCD1 + ncbi_gene_id: "215" + genomic_location: "X:152990323-153010216" + haploinsufficiency_score: SufficientEvidence + triplosensitivity_score: NoEvidenceAvailable + haploinsufficiency_disease_id: "MONDO:0018544" + triplosensitivity_disease_id: ~ + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@region-match-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@region-match-2.snap new file mode 100644 index 0000000..09663ca --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@region-match-2.snap @@ -0,0 +1,12 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: regions +--- +- isca_id: ISCA-46303 + isca_region_name: SOX9 upstream enhancer region + genomic_location: "17:67892996-69792434" + haploinsufficiency_score: SufficientEvidence + triplosensitivity_score: LittleEvidence + haploinsufficiency_disease_id: "MONDO:0007134" + triplosensitivity_disease_id: "MONDO:0009869" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@region-match.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@region-match.snap new file mode 100644 index 0000000..174d2c2 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__clingen_overlaps@region-match.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: genes +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2C-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2C-1.snap new file mode 100644 index 0000000..e0b4b98 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2C-1.snap @@ -0,0 +1,15 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2C1: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2C-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2C-2.snap new file mode 100644 index 0000000..7f0366e --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2C-2.snap @@ -0,0 +1,15 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2C2: + suggested_score: 0 + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2D-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2D-1.snap new file mode 100644 index 0000000..7c78071 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2D-1.snap @@ -0,0 +1,14 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2D1: + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2D-3.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2D-3.snap new file mode 100644 index 0000000..314bf01 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2D-3.snap @@ -0,0 +1,15 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2D3: + suggested_score: 0.3 + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2D-4.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2D-4.snap new file mode 100644 index 0000000..346d127 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2D-4.snap @@ -0,0 +1,18 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2D4: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + exon_nos: + - 29 + - 28 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2E.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2E.snap new file mode 100644 index 0000000..2030dc3 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@APOB-2E.snap @@ -0,0 +1,11 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2E: + suggested_score: 0.9 + pvs1_result: Pvs1 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@AVCRL2-2D-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@AVCRL2-2D-2.snap new file mode 100644 index 0000000..75aac4f --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@AVCRL2-2D-2.snap @@ -0,0 +1,20 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2D2: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:175" + hgnc_symbol: ACVRL1 + ensembl_gene_id: ENSG00000139567 + ncbi_gene_id: "94" + clinvar_ids: + - VCV000008252 + - VCV000212796 + - VCV000372722 + - VCV000426035 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL16A1-2B-neg.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL16A1-2B-neg.snap new file mode 100644 index 0000000..898c311 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL16A1-2B-neg.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2A-neg-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2A-neg-1.snap new file mode 100644 index 0000000..898c311 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2A-neg-1.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2A-neg-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2A-neg-2.snap new file mode 100644 index 0000000..898c311 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2A-neg-2.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2A-pos.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2A-pos.snap new file mode 100644 index 0000000..8126367 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2A-pos.snap @@ -0,0 +1,17 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2A: + suggested_score: 1 + hi_genes: + - gene_symbol: COL3A1 + ncbi_gene_id: "1281" + genomic_location: "2:189839099-189877472" + haploinsufficiency_score: SufficientEvidence + triplosensitivity_score: NoEvidenceAvailable + haploinsufficiency_disease_id: "MONDO:0007524" + triplosensitivity_disease_id: ~ + hi_regions: [] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2C-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2C-1.snap new file mode 100644 index 0000000..b58a6be --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2C-1.snap @@ -0,0 +1,15 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2C1: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2C-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2C-2.snap new file mode 100644 index 0000000..de3fd6a --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2C-2.snap @@ -0,0 +1,15 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2C2: + suggested_score: 0 + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2D-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2D-1.snap new file mode 100644 index 0000000..317ccf6 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2D-1.snap @@ -0,0 +1,14 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2D1: + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2D-3.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2D-3.snap new file mode 100644 index 0000000..8da4dee --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2D-3.snap @@ -0,0 +1,15 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2D3: + suggested_score: 0.3 + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2D-4.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2D-4.snap new file mode 100644 index 0000000..756076e --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2D-4.snap @@ -0,0 +1,18 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2D4: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + exon_nos: + - 50 + - 51 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2E.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2E.snap new file mode 100644 index 0000000..2030dc3 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@COL3A1-2E.snap @@ -0,0 +1,11 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2B: {} +- L2: + L2E: + suggested_score: 0.9 + pvs1_result: Pvs1 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-neg-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-neg-1.snap new file mode 100644 index 0000000..898c311 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-neg-1.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-neg-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-neg-2.snap new file mode 100644 index 0000000..898c311 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-neg-2.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-neg-3.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-neg-3.snap new file mode 100644 index 0000000..898c311 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-neg-3.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-pos.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-pos.snap new file mode 100644 index 0000000..e0cb4df --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46303-2A-pos.snap @@ -0,0 +1,17 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2A: + suggested_score: 1 + hi_genes: [] + hi_regions: + - isca_id: ISCA-46303 + isca_region_name: SOX9 upstream enhancer region + genomic_location: "17:67892996-69792434" + haploinsufficiency_score: SufficientEvidence + triplosensitivity_score: LittleEvidence + haploinsufficiency_disease_id: "MONDO:0007134" + triplosensitivity_disease_id: "MONDO:0009869" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2F-neg-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2F-neg-1.snap new file mode 100644 index 0000000..898c311 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2F-neg-1.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2F-neg-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2F-neg-2.snap new file mode 100644 index 0000000..898c311 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2F-neg-2.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2F-pos.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2F-pos.snap new file mode 100644 index 0000000..5065170 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2F-pos.snap @@ -0,0 +1,16 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2F: + suggested_score: -1 + benign_regions: + - isca_id: ISCA-46311 + isca_region_name: 1p36.21 population region (gnomAD-SV_v2.1_DEL_1_1094) + genomic_location: "1:12989199-12998900" + haploinsufficiency_score: DosageSensitivityUnlikely + triplosensitivity_score: NoEvidenceAvailable + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: ~ + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2G-pos-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2G-pos-1.snap new file mode 100644 index 0000000..86c4b43 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2G-pos-1.snap @@ -0,0 +1,16 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2G: + suggested_score: 0 + benign_regions: + - isca_id: ISCA-46311 + isca_region_name: 1p36.21 population region (gnomAD-SV_v2.1_DEL_1_1094) + genomic_location: "1:12989199-12998900" + haploinsufficiency_score: DosageSensitivityUnlikely + triplosensitivity_score: NoEvidenceAvailable + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: ~ + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2G-pos-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2G-pos-2.snap new file mode 100644 index 0000000..86c4b43 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@ISCA-46311-2G-pos-2.snap @@ -0,0 +1,16 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2G: + suggested_score: 0 + benign_regions: + - isca_id: ISCA-46311 + isca_region_name: 1p36.21 population region (gnomAD-SV_v2.1_DEL_1_1094) + genomic_location: "1:12989199-12998900" + haploinsufficiency_score: DosageSensitivityUnlikely + triplosensitivity_score: NoEvidenceAvailable + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: ~ + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@MFN2-2H-neg.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@MFN2-2H-neg.snap new file mode 100644 index 0000000..898c311 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@MFN2-2H-neg.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-neg-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-neg-1.snap new file mode 100644 index 0000000..898c311 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-neg-1.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-neg-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-neg-2.snap new file mode 100644 index 0000000..898c311 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-neg-2.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-pos-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-pos-1.snap new file mode 100644 index 0000000..480fcb3 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-pos-1.snap @@ -0,0 +1,20 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2H: + suggested_score: 0.15 + gene_hi_predictions: + - gene: + hgnc_id: "HGNC:9968" + hgnc_symbol: REV3L + ensembl_gene_id: ENSG00000009413 + ncbi_gene_id: "5980" + results: + - DecipherHiIndex: + hi_index: 0.03 + - GnomadPli: + pli_score: 1 + oe_lof_upper: 0.117 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-pos-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-pos-2.snap new file mode 100644 index 0000000..480fcb3 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-pos-2.snap @@ -0,0 +1,20 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2H: + suggested_score: 0.15 + gene_hi_predictions: + - gene: + hgnc_id: "HGNC:9968" + hgnc_symbol: REV3L + ensembl_gene_id: ENSG00000009413 + ncbi_gene_id: "5980" + results: + - DecipherHiIndex: + hi_index: 0.03 + - GnomadPli: + pli_score: 1 + oe_lof_upper: 0.117 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-pos-3.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-pos-3.snap new file mode 100644 index 0000000..480fcb3 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__evaluate@REV3L-2H-pos-3.snap @@ -0,0 +1,20 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2H: + suggested_score: 0.15 + gene_hi_predictions: + - gene: + hgnc_id: "HGNC:9968" + hgnc_symbol: REV3L + ensembl_gene_id: ENSG00000009413 + ncbi_gene_id: "5980" + results: + - DecipherHiIndex: + hi_index: 0.03 + - GnomadPli: + pli_score: 1 + oe_lof_upper: 0.117 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__get_gene_hi_predictions-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__get_gene_hi_predictions-2.snap new file mode 100644 index 0000000..101e932 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__get_gene_hi_predictions-2.snap @@ -0,0 +1,16 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res_rere +--- +gene: + hgnc_id: "HGNC:9965" + hgnc_symbol: RERE + ensembl_gene_id: ENSG00000142599 + ncbi_gene_id: "473" +results: + - DecipherHiIndex: + hi_index: 1.92 + - GnomadPli: + pli_score: 1 + oe_lof_upper: 0.12 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__get_gene_hi_predictions.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__get_gene_hi_predictions.snap new file mode 100644 index 0000000..f3afe95 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__get_gene_hi_predictions.snap @@ -0,0 +1,16 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res_rev3l +--- +gene: + hgnc_id: "HGNC:9968" + hgnc_symbol: REV3L + ensembl_gene_id: ENSG00000009413 + ncbi_gene_id: "5980" +results: + - DecipherHiIndex: + hi_index: 0.03 + - GnomadPli: + pli_score: 1 + oe_lof_upper: 0.117 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2a@COL3A1-2A-pos.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2a@COL3A1-2A-pos.snap new file mode 100644 index 0000000..8126367 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2a@COL3A1-2A-pos.snap @@ -0,0 +1,17 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2A: + suggested_score: 1 + hi_genes: + - gene_symbol: COL3A1 + ncbi_gene_id: "1281" + genomic_location: "2:189839099-189877472" + haploinsufficiency_score: SufficientEvidence + triplosensitivity_score: NoEvidenceAvailable + haploinsufficiency_disease_id: "MONDO:0007524" + triplosensitivity_disease_id: ~ + hi_regions: [] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2a@ISCA-46303-2A-pos.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2a@ISCA-46303-2A-pos.snap new file mode 100644 index 0000000..e0cb4df --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2a@ISCA-46303-2A-pos.snap @@ -0,0 +1,17 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2A: + suggested_score: 1 + hi_genes: [] + hi_regions: + - isca_id: ISCA-46303 + isca_region_name: SOX9 upstream enhancer region + genomic_location: "17:67892996-69792434" + haploinsufficiency_score: SufficientEvidence + triplosensitivity_score: LittleEvidence + haploinsufficiency_disease_id: "MONDO:0007134" + triplosensitivity_disease_id: "MONDO:0009869" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@APOB-2C-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@APOB-2C-1.snap new file mode 100644 index 0000000..ebe99a1 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@APOB-2C-1.snap @@ -0,0 +1,13 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2C1: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@APOB-2C-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@APOB-2C-2.snap new file mode 100644 index 0000000..74eba7a --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@APOB-2C-2.snap @@ -0,0 +1,13 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2C2: + suggested_score: 0 + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@COL3A1-2C-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@COL3A1-2C-1.snap new file mode 100644 index 0000000..809ec6f --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@COL3A1-2C-1.snap @@ -0,0 +1,13 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2C1: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@COL3A1-2C-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@COL3A1-2C-2.snap new file mode 100644 index 0000000..192cb1a --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2c@COL3A1-2C-2.snap @@ -0,0 +1,13 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2C2: + suggested_score: 0 + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-1-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-1-2.snap new file mode 100644 index 0000000..6023b83 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-1-2.snap @@ -0,0 +1,12 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D1: + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-1.snap new file mode 100644 index 0000000..b9fe24b --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-1.snap @@ -0,0 +1,12 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D1: + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-3-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-3-2.snap new file mode 100644 index 0000000..4a5e270 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-3-2.snap @@ -0,0 +1,13 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D3: + suggested_score: 0.3 + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-3.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-3.snap new file mode 100644 index 0000000..deac280 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-3.snap @@ -0,0 +1,13 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D3: + suggested_score: 0.3 + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-4-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-4-2.snap new file mode 100644 index 0000000..582d486 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-4-2.snap @@ -0,0 +1,16 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D4: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + exon_nos: + - 29 + - 28 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-4.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-4.snap new file mode 100644 index 0000000..c2470c7 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2d@2D-4.snap @@ -0,0 +1,16 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D4: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + exon_nos: + - 50 + - 51 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2e@APOB-2E.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2e@APOB-2E.snap new file mode 100644 index 0000000..66087dc --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2e@APOB-2E.snap @@ -0,0 +1,9 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: section +--- +L2: + L2E: + suggested_score: 0.9 + pvs1_result: Pvs1 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2e@COL3A1-2E.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2e@COL3A1-2E.snap new file mode 100644 index 0000000..66087dc --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2e@COL3A1-2E.snap @@ -0,0 +1,9 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: section +--- +L2: + L2E: + suggested_score: 0.9 + pvs1_result: Pvs1 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-1-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-1-2.snap new file mode 100644 index 0000000..4c7b669 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-1-2.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: "®ion_overlaps" +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-1-3.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-1-3.snap new file mode 100644 index 0000000..f00cc35 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-1-3.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +~ + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-1.snap new file mode 100644 index 0000000..6d5374b --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-1.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: "&gene_overlaps" +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-2-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-2-2.snap new file mode 100644 index 0000000..4c7b669 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-2-2.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: "®ion_overlaps" +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-2-3.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-2-3.snap new file mode 100644 index 0000000..f00cc35 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-2-3.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +~ + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-2.snap new file mode 100644 index 0000000..6d5374b --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-neg-2.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: "&gene_overlaps" +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-pos-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-pos-2.snap new file mode 100644 index 0000000..bee347a --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-pos-2.snap @@ -0,0 +1,12 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: "®ion_overlaps" +--- +- isca_id: ISCA-46311 + isca_region_name: 1p36.21 population region (gnomAD-SV_v2.1_DEL_1_1094) + genomic_location: "1:12989199-12998900" + haploinsufficiency_score: DosageSensitivityUnlikely + triplosensitivity_score: NoEvidenceAvailable + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: ~ + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-pos-3.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-pos-3.snap new file mode 100644 index 0000000..5065170 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-pos-3.snap @@ -0,0 +1,16 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +- L2: + L2F: + suggested_score: -1 + benign_regions: + - isca_id: ISCA-46311 + isca_region_name: 1p36.21 population region (gnomAD-SV_v2.1_DEL_1_1094) + genomic_location: "1:12989199-12998900" + haploinsufficiency_score: DosageSensitivityUnlikely + triplosensitivity_score: NoEvidenceAvailable + haploinsufficiency_disease_id: ~ + triplosensitivity_disease_id: ~ + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-pos.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-pos.snap new file mode 100644 index 0000000..6d5374b --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2f@ISCA-46311-2F-pos.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: "&gene_overlaps" +--- +[] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@MFN2-2H-neg.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@MFN2-2H-neg.snap new file mode 100644 index 0000000..f00cc35 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@MFN2-2H-neg.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +~ + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-neg-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-neg-1.snap new file mode 100644 index 0000000..f00cc35 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-neg-1.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +~ + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-neg-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-neg-2.snap new file mode 100644 index 0000000..f00cc35 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-neg-2.snap @@ -0,0 +1,6 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +~ + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-pos-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-pos-1.snap new file mode 100644 index 0000000..30bdf54 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-pos-1.snap @@ -0,0 +1,20 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2H: + suggested_score: 0.15 + gene_hi_predictions: + - gene: + hgnc_id: "HGNC:9968" + hgnc_symbol: REV3L + ensembl_gene_id: ENSG00000009413 + ncbi_gene_id: "5980" + results: + - DecipherHiIndex: + hi_index: 0.03 + - GnomadPli: + pli_score: 1 + oe_lof_upper: 0.117 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-pos-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-pos-2.snap new file mode 100644 index 0000000..30bdf54 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-pos-2.snap @@ -0,0 +1,20 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2H: + suggested_score: 0.15 + gene_hi_predictions: + - gene: + hgnc_id: "HGNC:9968" + hgnc_symbol: REV3L + ensembl_gene_id: ENSG00000009413 + ncbi_gene_id: "5980" + results: + - DecipherHiIndex: + hi_index: 0.03 + - GnomadPli: + pli_score: 1 + oe_lof_upper: 0.117 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-pos-3.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-pos-3.snap new file mode 100644 index 0000000..30bdf54 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_case_2h@REV3L-2H-pos-3.snap @@ -0,0 +1,20 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2H: + suggested_score: 0.15 + gene_hi_predictions: + - gene: + hgnc_id: "HGNC:9968" + hgnc_symbol: REV3L + ensembl_gene_id: ENSG00000009413 + ncbi_gene_id: "5980" + results: + - DecipherHiIndex: + hi_index: 0.03 + - GnomadPli: + pli_score: 1 + oe_lof_upper: 0.117 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2C-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2C-1.snap new file mode 100644 index 0000000..ebe99a1 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2C-1.snap @@ -0,0 +1,13 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2C1: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2C-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2C-2.snap new file mode 100644 index 0000000..74eba7a --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2C-2.snap @@ -0,0 +1,13 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2C2: + suggested_score: 0 + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2D-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2D-1.snap new file mode 100644 index 0000000..6023b83 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2D-1.snap @@ -0,0 +1,12 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D1: + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2D-3.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2D-3.snap new file mode 100644 index 0000000..4a5e270 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2D-3.snap @@ -0,0 +1,13 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D3: + suggested_score: 0.3 + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2D-4.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2D-4.snap new file mode 100644 index 0000000..582d486 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2D-4.snap @@ -0,0 +1,16 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D4: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:603" + hgnc_symbol: APOB + ensembl_gene_id: ENSG00000084674 + ncbi_gene_id: "338" + exon_nos: + - 29 + - 28 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2E.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2E.snap new file mode 100644 index 0000000..1e5c06e --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@APOB-2E.snap @@ -0,0 +1,9 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2E: + suggested_score: 0.9 + pvs1_result: Pvs1 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@AVCRL2-2D-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@AVCRL2-2D-2.snap new file mode 100644 index 0000000..36b0879 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@AVCRL2-2D-2.snap @@ -0,0 +1,18 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D2: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:175" + hgnc_symbol: ACVRL1 + ensembl_gene_id: ENSG00000139567 + ncbi_gene_id: "94" + clinvar_ids: + - VCV000008252 + - VCV000212796 + - VCV000372722 + - VCV000426035 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2C-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2C-1.snap new file mode 100644 index 0000000..809ec6f --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2C-1.snap @@ -0,0 +1,13 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2C1: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2C-2.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2C-2.snap new file mode 100644 index 0000000..192cb1a --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2C-2.snap @@ -0,0 +1,13 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2C2: + suggested_score: 0 + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2D-1.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2D-1.snap new file mode 100644 index 0000000..b9fe24b --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2D-1.snap @@ -0,0 +1,12 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D1: + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2D-3.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2D-3.snap new file mode 100644 index 0000000..deac280 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2D-3.snap @@ -0,0 +1,13 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D3: + suggested_score: 0.3 + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2D-4.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2D-4.snap new file mode 100644 index 0000000..c2470c7 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2D-4.snap @@ -0,0 +1,16 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2D4: + suggested_score: 0.9 + gene: + hgnc_id: "HGNC:2201" + hgnc_symbol: COL3A1 + ensembl_gene_id: ENSG00000168542 + ncbi_gene_id: "1281" + exon_nos: + - 50 + - 51 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2E.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2E.snap new file mode 100644 index 0000000..1e5c06e --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section2__test__handle_cases_2c_2e@COL3A1-2E.snap @@ -0,0 +1,9 @@ +--- +source: src/strucvars/eval/del/section2.rs +expression: res +--- +L2: + L2E: + suggested_score: 0.9 + pvs1_result: Pvs1 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section3__test__evaluate_l1b.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section3__test__evaluate_l1b.snap new file mode 100644 index 0000000..c61615a --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__section3__test__evaluate_l1b.snap @@ -0,0 +1,10 @@ +--- +source: src/strucvars/eval/del/section3.rs +expression: "&evaluator.evaluate(&Section::L1(L1::L1B(L1B {})))?" +--- +L3: + L3A: + suggested_score: 0 + num_genes: 0 + genes: [] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__test__evaluate@1-11937319-11944386.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__test__evaluate@1-11937319-11944386.snap new file mode 100644 index 0000000..c7eb95a --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__test__evaluate@1-11937319-11944386.snap @@ -0,0 +1,12 @@ +--- +source: src/strucvars/eval/del/mod.rs +expression: result +--- +- L1: + L1B: {} +- L3: + L3A: + suggested_score: 0 + num_genes: 0 + genes: [] + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__test__evaluate@1-12050913-12054733.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__test__evaluate@1-12050913-12054733.snap new file mode 100644 index 0000000..40eb4b3 --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__test__evaluate@1-12050913-12054733.snap @@ -0,0 +1,29 @@ +--- +source: src/strucvars/eval/del/mod.rs +expression: result +--- +- L1: + L1A: + genes: + - gene: + hgnc_id: "HGNC:16877" + hgnc_symbol: MFN2 + ensembl_gene_id: ENSG00000116688 + ncbi_gene_id: "9927" + tx_ids: + - NM_001127660.2 + - NM_014874.4 +- L3: + L3A: + suggested_score: 0 + num_genes: 1 + genes: + - gene: + hgnc_id: "HGNC:16877" + hgnc_symbol: MFN2 + ensembl_gene_id: ENSG00000116688 + ncbi_gene_id: "9927" + tx_ids: + - NM_001127660.2 + - NM_014874.4 + diff --git a/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__test__evaluate@1-12098550-12103898.snap b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__test__evaluate@1-12098550-12103898.snap new file mode 100644 index 0000000..c7eb95a --- /dev/null +++ b/src/strucvars/eval/del/snapshots/scarus__strucvars__eval__del__test__evaluate@1-12098550-12103898.snap @@ -0,0 +1,12 @@ +--- +source: src/strucvars/eval/del/mod.rs +expression: result +--- +- L1: + L1B: {} +- L3: + L3A: + suggested_score: 0 + num_genes: 0 + genes: [] + diff --git a/src/strucvars/eval/dup/mod.rs b/src/strucvars/eval/dup/mod.rs new file mode 100644 index 0000000..4c0b2dc --- /dev/null +++ b/src/strucvars/eval/dup/mod.rs @@ -0,0 +1,4 @@ +//! Evaluation of duplications, gain of copy number. + +/// Enumeration of the categories for the structural variant evaluation. +pub enum Category {} diff --git a/src/strucvars/eval/mod.rs b/src/strucvars/eval/mod.rs new file mode 100644 index 0000000..7808b01 --- /dev/null +++ b/src/strucvars/eval/mod.rs @@ -0,0 +1,264 @@ +//! Implementation of the categories for the structural variant evaluation. + +pub mod common; +pub mod del; +pub mod dup; + +use std::{path::Path, sync::Arc}; + +use hgvs::data::interface::Provider; +use prost::Message as _; + +use super::data::clingen_dosage::Data as ClingenDosageData; +use super::data::decipher_hi::Data as DecipherHiData; +use super::data::gnomad::Data as GnomadConstraintData; +use super::data::hgnc::Data as GeneIdData; + +/// Evaluator for structural variants. +pub struct Evaluator { + /// The assembly to be used. + #[allow(dead_code)] + assembly: biocommons_bioutils::assemblies::Assembly, + + /// Mehari data/transcript provider. + provider: Arc, + /// Mapping from chromosome name to accession for the given assembly. + chrom_to_ac: rustc_hash::FxHashMap, + + /// The gene identifier data. + gene_id_data: GeneIdData, + /// The ClinGen dosage data. + clingen_dosage_data: ClingenDosageData, + /// The DECIPHER HI data. + decipher_hi_data: DecipherHiData, + /// The gnomAD constraint data. + gnomad_constraint_data: GnomadConstraintData, + + /// The ClinVar minimal database. + clinvar_db: Arc>, +} + +impl Evaluator { + /// Construct from the given paths. + /// + /// # Arguments + /// + /// * `path_tx_db` - Path to Mehari transcript database + /// * `path_hgnc` - Path to HGNC gene identifier mapping file + /// * `path_clingen_dosage_genes` - Path to the `ClinGen_gene_curation_list_GRCh37.tsv` file. + /// * `path_clingen_dosage_regions` - Path to the `ClinGen_region_curation_list_GRCh37.tsv` file. + /// * `path_decipher_hi` - Path to the `decipher_hi_prediction.tsv` file. + /// * `path_gnomad_constraints` - Path to the `gnomad_constraints.tsv` file. + /// * `path_clinvar_minimal` - Path to the "minimal" ClinVar RocksDB directory. + /// + /// # Returns + /// + /// A new `Evaluator`. + /// + /// # Errors + /// + /// If anything goes wrong, it returns a generic `anyhow::Error`. + #[allow(clippy::too_many_arguments)] + pub fn new( + assembly: biocommons_bioutils::assemblies::Assembly, + path_tx_db: P1, + path_hgnc: P2, + path_clingen_dosage_genes: P3, + path_clingen_dosage_regions: P4, + path_decipher_hi: P5, + path_gnomad_constraints: P6, + path_clinvar_minimal: P7, + ) -> Result + where + P1: AsRef, + P2: AsRef, + P3: AsRef, + P4: AsRef, + P5: AsRef, + P6: AsRef, + P7: AsRef, + { + let provider = Self::load_provider(assembly, path_tx_db.as_ref()) + .map_err(|e| anyhow::anyhow!("failed to load transcript database: {}", e))?; + let gene_id_data = GeneIdData::new(path_hgnc) + .map_err(|e| anyhow::anyhow!("failed to load gene identifier data: {}", e))?; + let clingen_dosage_data = ClingenDosageData::new( + path_clingen_dosage_genes, + path_clingen_dosage_regions, + &gene_id_data, + assembly.into(), + ) + .map_err(|e| anyhow::anyhow!("failed to load clingen dosage data: {}", e))?; + let decipher_hi_data = DecipherHiData::load(path_decipher_hi) + .map_err(|e| anyhow::anyhow!("failed to load decipher hi data: {}", e))?; + let gnomad_constraint_data = + GnomadConstraintData::load(path_gnomad_constraints, &gene_id_data) + .map_err(|e| anyhow::anyhow!("failed to load gnomAD constraint data: {}", e))?; + let (clinvar_db, _) = annonars::clinvar_minimal::cli::query::open_rocksdb( + path_clinvar_minimal, + "clinvar", + "meta", + ) + .map_err(|e| anyhow::anyhow!("failed to open 'minimal' ClinVar RocksDB: {}", e))?; + + Ok(Self { + assembly, + chrom_to_ac: Self::chrom_to_acc(assembly, &provider), + provider, + gene_id_data, + clingen_dosage_data, + decipher_hi_data, + gnomad_constraint_data, + clinvar_db, + }) + } + + fn load_provider( + assembly: biocommons_bioutils::assemblies::Assembly, + path_tx_db: &Path, + ) -> Result, anyhow::Error> { + tracing::info!("Opening transcript database"); + let tx_db = mehari::annotate::seqvars::load_tx_db(&format!("{}", path_tx_db.display(),)) + .map_err(|e| anyhow::anyhow!("failed to load transcript database: {}", e))?; + tracing::info!("Building transcript interval trees ..."); + let provider = Arc::new(mehari::annotate::seqvars::provider::Provider::new( + tx_db, + assembly, + mehari::annotate::seqvars::provider::ConfigBuilder::default() + // We only consider MANE/ManePlusClinical or longest transcript. + .transcript_picking(true) + .build() + .unwrap(), + )); + tracing::info!("... done building transcript interval trees"); + + Ok(provider) + } + + fn chrom_to_acc( + assembly: biocommons_bioutils::assemblies::Assembly, + provider: &Arc, + ) -> rustc_hash::FxHashMap { + let acc_to_chrom = provider.get_assembly_map(assembly); + let mut chrom_to_acc = rustc_hash::FxHashMap::default(); + for (acc, chrom) in &acc_to_chrom { + let chrom = if chrom.starts_with("chr") { + chrom.strip_prefix("chr").unwrap() + } else { + chrom + }; + chrom_to_acc.insert(chrom.to_string(), acc.clone()); + chrom_to_acc.insert(format!("chr{chrom}"), acc.clone()); + } + + chrom_to_acc + } + + /// Return the gene identifier data. + /// + /// # Returns + /// + /// The gene identifier data. + pub fn gene_id_data(&self) -> &GeneIdData { + &self.gene_id_data + } + + /// Query for ClinVar variants in 1-based range. + /// + /// # Arguments + /// + /// * `chrom` - Chromosome name. + /// * `start` - Start position (1-based). + /// * `stop` - Stop position (1-based). + /// + /// # Returns + /// + /// The ClinVar variants in the given range. + /// + /// # Errors + /// + /// If anything goes wrong, it returns a generic `anyhow::Error`. + pub fn query_clinvar_range( + &self, + chrom: &str, + start: u32, + stop: u32, + ) -> Result, anyhow::Error> { + tracing::trace!("starting clinvar query {}:{}-{}", &chrom, start, stop); + let start = start as i32; + let stop = stop as i32; + + // Obtain iterator and seek to start. + let cf_handle = self + .clinvar_db + .cf_handle("clinvar") + .expect("missing clinvar column family"); + let mut iter = self.clinvar_db.raw_iterator_cf(&cf_handle); + let pos_start = annonars::common::keys::Pos { + chrom: chrom.to_string(), + pos: start, + }; + let key_start: Vec = pos_start.into(); + tracing::debug!(" seeking to key {:?}", &key_start); + iter.seek(&key_start); + + // Cast stop to `keys::Pos`. + let pos_stop = annonars::common::keys::Pos { + chrom: chrom.to_string(), + pos: stop, + }; + let key_stop: Vec = pos_stop.into(); + tracing::debug!(" stop = {:?}", &key_stop); + + tracing::trace!("querying {}:{}-{}...", chrom, start, stop); + let mut result = Vec::new(); + let before_query = std::time::Instant::now(); + while iter.valid() { + if let Some(value) = iter.value() { + // Stop if we are behind the range end already. + let iter_key = iter.key().unwrap(); + let iter_pos: annonars::common::keys::Pos = iter_key.into(); + if iter_pos.chrom != chrom || iter_pos.pos > stop { + break; + } + + // Otherwise, decode the value from the database. + let record = annonars::clinvar_minimal::pbs::Record::decode(value)?; + result.push(record); + + // Proceed to the next database row. + iter.next(); + } else { + break; + } + } + tracing::trace!( + "... done querying {} records in {:?}", + result.len(), + before_query.elapsed() + ); + + Ok(result) + } +} + +#[cfg(test)] +pub mod test { + use super::Evaluator; + + /// Fixture with the global evaluator initialize with all data paths. + #[rstest::fixture] + pub fn global_evaluator_37() -> Evaluator { + Evaluator::new( + biocommons_bioutils::assemblies::Assembly::Grch37p10, + "tests/data/strucvars/hi/txs_example_hi.bin.zst", + "tests/data/hgnc.tsv", + "tests/data/strucvars/ClinGen_gene_curation_list_GRCh37.tsv", + "tests/data/strucvars/ClinGen_region_curation_list_GRCh37.tsv", + "tests/data/strucvars/decipher_hi_prediction.tsv", + "tests/data/strucvars/gnomad_constraints.tsv", + "tests/data/strucvars/hi/clinvar/rocksdb", + ) + .expect("could not initialize global evaluator") + } +} diff --git a/src/strucvars/mod.rs b/src/strucvars/mod.rs index 66ece16..412916d 100644 --- a/src/strucvars/mod.rs +++ b/src/strucvars/mod.rs @@ -1,11 +1,26 @@ //! ACMG rule prediction for structural variants. +pub mod data; +pub mod ds; +pub mod eval; + +use std::path::PathBuf; + use clap::Parser; +use crate::common::Assembly; + /// Command line arguments for `strucvars` command. #[derive(Parser, Debug)] #[command(about = "ACMG rule applications for structural variants", long_about = None)] -pub struct Args {} +pub struct Args { + /// The used assembly. + #[clap(long, value_enum)] + pub assembly: Assembly, + /// Path to Mehari transcript database file. + #[clap(long)] + pub mehari_tx_db: PathBuf, +} /// Main entry point for the `strucvars` command. /// diff --git a/tests/data/README.md b/tests/data/README.md new file mode 100644 index 0000000..86fdebb --- /dev/null +++ b/tests/data/README.md @@ -0,0 +1,4 @@ +# Test data for Scarus + +- `hgnc.tsv` -- + Mapping between the different gene identifiers. diff --git a/tests/data/hgnc.tsv b/tests/data/hgnc.tsv new file mode 100644 index 0000000..43aba6d --- /dev/null +++ b/tests/data/hgnc.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:198303a225141a62a1b7eafd138889b2a56ed78cdb3bca676f00e0569ecdd7a4 +size 1801068 diff --git a/tests/data/strucvars/ClinGen_gene_curation_list_GRCh37.tsv b/tests/data/strucvars/ClinGen_gene_curation_list_GRCh37.tsv new file mode 100644 index 0000000..2957da2 --- /dev/null +++ b/tests/data/strucvars/ClinGen_gene_curation_list_GRCh37.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93a0c921361f950c4ecbb21e810e34c172df87948f9619e523a4182d692b712b +size 244103 diff --git a/tests/data/strucvars/ClinGen_region_curation_list_GRCh37.tsv b/tests/data/strucvars/ClinGen_region_curation_list_GRCh37.tsv new file mode 100644 index 0000000..6393909 --- /dev/null +++ b/tests/data/strucvars/ClinGen_region_curation_list_GRCh37.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2d20759c1666e564d20fa5de2ebf680cb5aa23738fb062525e9aa03c8841cb +size 98878 diff --git a/tests/data/strucvars/ClinGen_region_curation_list_GRCh38.tsv b/tests/data/strucvars/ClinGen_region_curation_list_GRCh38.tsv new file mode 100644 index 0000000..44576f3 --- /dev/null +++ b/tests/data/strucvars/ClinGen_region_curation_list_GRCh38.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89c1e9f5bfae45ee9c6fd5b1538d64541243ff09bf5b6f1fe506dee922728923 +size 98741 diff --git a/tests/data/strucvars/README.md b/tests/data/strucvars/README.md new file mode 100644 index 0000000..82cb9a7 --- /dev/null +++ b/tests/data/strucvars/README.md @@ -0,0 +1,4 @@ +# Test cases for ACMG rule evaluation for CNVs + +- `ClinGen_{gene,region}_curation_list_GRCh37.tsv` -- + Curated genes and regions from ClinGen regarding dosage sensitivity. diff --git a/tests/data/strucvars/bootstrap.sh b/tests/data/strucvars/bootstrap.sh new file mode 100644 index 0000000..21fc64a --- /dev/null +++ b/tests/data/strucvars/bootstrap.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env + +SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" + +set -x + +cd $SCRIPTPATH + +rm -f \ + ClinGen_region_curation_list_GRCh37.tsv \ + ClinGen_region_curation_list_GRCh38.tsv \ + ClinGen_gene_curation_list_GRCh37.tsv + +wget -q \ + http://ftp.clinicalgenome.org/ClinGen_region_curation_list_GRCh37.tsv \ + http://ftp.clinicalgenome.org/ClinGen_region_curation_list_GRCh38.tsv \ + http://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh37.tsv diff --git a/tests/data/strucvars/decipher_hi_prediction.tsv b/tests/data/strucvars/decipher_hi_prediction.tsv new file mode 100644 index 0000000..7adfd6c --- /dev/null +++ b/tests/data/strucvars/decipher_hi_prediction.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca9085b1b4dd9fba5cf7e8a7c5b257ac68fd76ca114c986fcc3ddb59ef4a1693 +size 625076 diff --git a/tests/data/strucvars/gnomad_constraints.tsv b/tests/data/strucvars/gnomad_constraints.tsv new file mode 100644 index 0000000..ea52787 --- /dev/null +++ b/tests/data/strucvars/gnomad_constraints.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d80cd05c61519e846e18885182c03a8e529e40db4d57144609cdcd115b869d7 +size 4371448 diff --git a/tests/data/strucvars/hi/README.md b/tests/data/strucvars/hi/README.md new file mode 100644 index 0000000..3351aec --- /dev/null +++ b/tests/data/strucvars/hi/README.md @@ -0,0 +1,48 @@ +# Test data with example for haploinsufficiency (HI) + +We create test data for HI using the following data. + +## Established HI Genes + +We use the following two HI genes on forward/reverse strand for core test data. + +- gene: APOB + - chr2:21224301-21266945 + - reverse strand +- gene: COL3A1 + - chr2:189839099-189877472 + - forward strand +- gene: ACVRL1 + - has pathogenic variants in last coding exon + - chr12:52301202-52317145 + - forward strand + +## Established Non-HI genes + +- gene: COL16A1 + - chr1:32117848-32169768 + - reverse strand + +## Predicted Non-Established Genes + +The following genes are predicted to be HI but are not in ClinGen. +We use them for extended test data. + +- gene: MFN2 + - pLI: 0.99 + - NOT in DECIPHER HI + - chr1:12,040,238-12,073,572 + - forward strand +- gene: REV3L + - is in DECIPHER HI + - chr6:111,620,234-111,804,432 + - reverse strand +- gene: RERE + - is in DECIPHER HI + - chr1:8,412,464-8,877,699 + - reverse strand + +## ClinVar Variants + +- we extract all ClinVar variants in all genes from above +- also, we build an "empty" clinvar file with no variants diff --git a/tests/data/strucvars/hi/bootstrap.sh b/tests/data/strucvars/hi/bootstrap.sh new file mode 100644 index 0000000..aca3f1c --- /dev/null +++ b/tests/data/strucvars/hi/bootstrap.sh @@ -0,0 +1,65 @@ +#!/usr/bin/bash + +SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" + +set -x +set -euo pipefail + +# See README.md for a description of genes and methodology. + +export TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" EXIT + +# -- transcripts -- + +gene_symbols=" +ACVRL1 +APOB +COL3A1 + +MFN2 +REV3L +RERE +" + +mehari db create \ + -vvv \ + --genome-release grch37 \ + --path-mane-txs-tsv $SCRIPTPATH/tx-mane.tsv \ + --path-out $SCRIPTPATH/txs_example_hi.bin.zst \ + --path-cdot-json ~/mehari-data-tx/cdot-refseq-vep110~0+0.2.21.json.gz \ + --path-seqrepo-instance ~/mehari-data-tx/seqrepo/main \ + $(for gene_symbol in $gene_symbols; do echo --gene-symbols $gene_symbol; done) + +# -- clinvar (filled) -- + +wget -O $TMPDIR/annonars-clinvar-minimal-grch37-20231112+0.24.2.tar.gz \ + https://github.com/bihealth/annonars-data-clinvar/releases/download/annonars-data-clinvar-20231112/annonars-clinvar-minimal-grch37-20231112+0.24.2.tar.gz +tar -C $TMPDIR -xf $TMPDIR/annonars-clinvar-minimal-grch37-20231112+0.24.2.tar.gz + +cat < $TMPDIR/regions.bed +1 8412464 8877699 +1 12040238 12073572 +2 21224301 21266945 +2 189839099 189877472 +6 111620234 111804432 +12 52314487 52314714 +EOF + +mkdir -p tests/data/strucvars/hi/clinvar +rm -rf tests/data/strucvars/hi/clinvar/rocksdb + +annonars db-utils copy -vvv \ + --path-in $TMPDIR/annonars-clinvar-minimal-grch37-20231112+0.24.2/rocksdb \ + --path-out tests/data/strucvars/hi/clinvar/rocksdb \ + --path-beds $TMPDIR/regions.bed + +# -- clinvar (empty) -- + +mkdir -p tests/data/strucvars/hi/clinvar-empty +rm -rf tests/data/strucvars/hi/clinvar-empty/rocksdb + +annonars db-utils copy -vvv \ + --path-in $TMPDIR/annonars-clinvar-minimal-grch37-20231112+0.24.2/rocksdb \ + --path-out tests/data/strucvars/hi/clinvar-empty/rocksdb \ + --path-beds /dev/null diff --git a/tests/data/strucvars/hi/clinvar-empty/rocksdb/000014.sst b/tests/data/strucvars/hi/clinvar-empty/rocksdb/000014.sst new file mode 100644 index 0000000..75c4318 --- /dev/null +++ b/tests/data/strucvars/hi/clinvar-empty/rocksdb/000014.sst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed1ee8343514393cd43ddb96c376771d1a472f56e990a77e7d38a8eb575798e +size 1343 diff --git a/tests/data/strucvars/hi/clinvar-empty/rocksdb/CURRENT b/tests/data/strucvars/hi/clinvar-empty/rocksdb/CURRENT new file mode 100644 index 0000000..f8d5048 --- /dev/null +++ b/tests/data/strucvars/hi/clinvar-empty/rocksdb/CURRENT @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c283f6e81028b9eb0760d918ee4bc0aa256ed3b926393c1734c760c4bd724fd +size 16 diff --git a/tests/data/strucvars/hi/clinvar-empty/rocksdb/IDENTITY b/tests/data/strucvars/hi/clinvar-empty/rocksdb/IDENTITY new file mode 100644 index 0000000..f3d49fb --- /dev/null +++ b/tests/data/strucvars/hi/clinvar-empty/rocksdb/IDENTITY @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1524ffe89a10fc2c7fa9a9d8af9f0a3aa0b5c6174d5d73a613e294353369504e +size 36 diff --git a/tests/data/strucvars/hi/clinvar-empty/rocksdb/LOCK b/tests/data/strucvars/hi/clinvar-empty/rocksdb/LOCK new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/strucvars/hi/clinvar-empty/rocksdb/LOG b/tests/data/strucvars/hi/clinvar-empty/rocksdb/LOG new file mode 100644 index 0000000..fd0be3b --- /dev/null +++ b/tests/data/strucvars/hi/clinvar-empty/rocksdb/LOG @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5415e6eb326238aa24d02c39a655ec5ee3691a3f1c7d1a2354a6810ff605b841 +size 54676 diff --git a/tests/data/strucvars/hi/clinvar-empty/rocksdb/MANIFEST-000005 b/tests/data/strucvars/hi/clinvar-empty/rocksdb/MANIFEST-000005 new file mode 100644 index 0000000..097a1f9 --- /dev/null +++ b/tests/data/strucvars/hi/clinvar-empty/rocksdb/MANIFEST-000005 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59fd3b0350e2d3f988810c554caa87d9fc7e327ab1ca347cf31b2fc6406b8359 +size 427 diff --git a/tests/data/strucvars/hi/clinvar-empty/rocksdb/OPTIONS-000009 b/tests/data/strucvars/hi/clinvar-empty/rocksdb/OPTIONS-000009 new file mode 100644 index 0000000..cdbb62c --- /dev/null +++ b/tests/data/strucvars/hi/clinvar-empty/rocksdb/OPTIONS-000009 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29fe73c75e82934fb039297c14bbfd05fff6c95ef7764c24fc6d5dfbcfe6aca4 +size 15368 diff --git a/tests/data/strucvars/hi/clinvar-empty/rocksdb/OPTIONS-000011 b/tests/data/strucvars/hi/clinvar-empty/rocksdb/OPTIONS-000011 new file mode 100644 index 0000000..cdbb62c --- /dev/null +++ b/tests/data/strucvars/hi/clinvar-empty/rocksdb/OPTIONS-000011 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29fe73c75e82934fb039297c14bbfd05fff6c95ef7764c24fc6d5dfbcfe6aca4 +size 15368 diff --git a/tests/data/strucvars/hi/clinvar/rocksdb/000014.sst b/tests/data/strucvars/hi/clinvar/rocksdb/000014.sst new file mode 100644 index 0000000..fca82a8 --- /dev/null +++ b/tests/data/strucvars/hi/clinvar/rocksdb/000014.sst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ae42d4011259196362588b61f9244d1742f44f12043de3a4ddea4265b2cb67a +size 1343 diff --git a/tests/data/strucvars/hi/clinvar/rocksdb/000016.sst b/tests/data/strucvars/hi/clinvar/rocksdb/000016.sst new file mode 100644 index 0000000..ef1085f --- /dev/null +++ b/tests/data/strucvars/hi/clinvar/rocksdb/000016.sst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d64b1fc216bd2dc82703eeaed20e0a524352ed1a4895efa3f279e67bf1a90314 +size 326889 diff --git a/tests/data/strucvars/hi/clinvar/rocksdb/CURRENT b/tests/data/strucvars/hi/clinvar/rocksdb/CURRENT new file mode 100644 index 0000000..f8d5048 --- /dev/null +++ b/tests/data/strucvars/hi/clinvar/rocksdb/CURRENT @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c283f6e81028b9eb0760d918ee4bc0aa256ed3b926393c1734c760c4bd724fd +size 16 diff --git a/tests/data/strucvars/hi/clinvar/rocksdb/IDENTITY b/tests/data/strucvars/hi/clinvar/rocksdb/IDENTITY new file mode 100644 index 0000000..07eb3c7 --- /dev/null +++ b/tests/data/strucvars/hi/clinvar/rocksdb/IDENTITY @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d96b9817bea16d195a8e148a9ca87cd2436f95ce9bd6b3ae0101d5da9f217973 +size 36 diff --git a/tests/data/strucvars/hi/clinvar/rocksdb/LOCK b/tests/data/strucvars/hi/clinvar/rocksdb/LOCK new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/strucvars/hi/clinvar/rocksdb/LOG b/tests/data/strucvars/hi/clinvar/rocksdb/LOG new file mode 100644 index 0000000..9253ed7 --- /dev/null +++ b/tests/data/strucvars/hi/clinvar/rocksdb/LOG @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27c25bb1f1ef171bf56e98b6bd0b7bf42b7c21bc604ba771cbd185c39bc63d1a +size 62037 diff --git a/tests/data/strucvars/hi/clinvar/rocksdb/MANIFEST-000005 b/tests/data/strucvars/hi/clinvar/rocksdb/MANIFEST-000005 new file mode 100644 index 0000000..897c3ae --- /dev/null +++ b/tests/data/strucvars/hi/clinvar/rocksdb/MANIFEST-000005 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c65c7f82a54b5a56b61d5c1316defca1b17b6b73167e529838ce54a69ff2846 +size 665 diff --git a/tests/data/strucvars/hi/clinvar/rocksdb/OPTIONS-000009 b/tests/data/strucvars/hi/clinvar/rocksdb/OPTIONS-000009 new file mode 100644 index 0000000..cdbb62c --- /dev/null +++ b/tests/data/strucvars/hi/clinvar/rocksdb/OPTIONS-000009 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29fe73c75e82934fb039297c14bbfd05fff6c95ef7764c24fc6d5dfbcfe6aca4 +size 15368 diff --git a/tests/data/strucvars/hi/clinvar/rocksdb/OPTIONS-000011 b/tests/data/strucvars/hi/clinvar/rocksdb/OPTIONS-000011 new file mode 100644 index 0000000..cdbb62c --- /dev/null +++ b/tests/data/strucvars/hi/clinvar/rocksdb/OPTIONS-000011 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29fe73c75e82934fb039297c14bbfd05fff6c95ef7764c24fc6d5dfbcfe6aca4 +size 15368 diff --git a/tests/data/strucvars/hi/tx-mane.tsv b/tests/data/strucvars/hi/tx-mane.tsv new file mode 100644 index 0000000..4d6c6cc --- /dev/null +++ b/tests/data/strucvars/hi/tx-mane.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a0f0376555109a514d48d59deacc8f6cce406bb6188f65fd02b1b2e148cb2e5 +size 5259153 diff --git a/tests/data/strucvars/hi/txs_example_hi.bin.zst b/tests/data/strucvars/hi/txs_example_hi.bin.zst new file mode 100644 index 0000000..7337217 --- /dev/null +++ b/tests/data/strucvars/hi/txs_example_hi.bin.zst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca2ddc3262ef613e3d7b62fe996caa1d485088d405e080af9966e7548cc4aded +size 21416 diff --git a/tests/data/strucvars/hi/txs_example_hi.bin.zst.report b/tests/data/strucvars/hi/txs_example_hi.bin.zst.report new file mode 100644 index 0000000..ad475de --- /dev/null +++ b/tests/data/strucvars/hi/txs_example_hi.bin.zst.report @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49341f6e042e3b692aad863fa440208ba3b12f4f87712258658e8cea6e149ac6 +size 420 diff --git a/tests/data/strucvars/rec/README.md b/tests/data/strucvars/rec/README.md new file mode 100644 index 0000000..87a5e47 --- /dev/null +++ b/tests/data/strucvars/rec/README.md @@ -0,0 +1,5 @@ +# Test data with example for LoF-tolerance / recessive inheritance + +- gene: MTHFR +- pLI: 0.00 +- disease: OMIM:236250 Methlyenetetrahydrofolate reductase deficiency diff --git a/tests/data/strucvars/ts/README.md b/tests/data/strucvars/ts/README.md new file mode 100644 index 0000000..a68c00e --- /dev/null +++ b/tests/data/strucvars/ts/README.md @@ -0,0 +1 @@ +# Test data with example for triplosensitivity (TS)