diff --git a/src/common.rs b/src/common.rs index 44175e9e..c33e798f 100644 --- a/src/common.rs +++ b/src/common.rs @@ -514,30 +514,6 @@ where Ok(buffer) } -/// Enum for the supported gene/transcript databases. -#[derive( - serde::Serialize, - serde::Deserialize, - enum_map::Enum, - PartialEq, - Eq, - Clone, - Copy, - Debug, - Default, - strum::EnumString, - strum::Display, -)] -#[serde(rename_all = "lowercase")] -#[strum(serialize_all = "lowercase")] -pub enum Database { - /// RefSeq - #[default] - RefSeq, - /// ENSEMBL - Ensembl, -} - /// Enum for the supported TADs. #[derive( serde::Serialize, diff --git a/src/main.rs b/src/main.rs index acf8bd27..4ca2bca2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -35,7 +35,7 @@ enum Commands { Seqvars(Seqvars), } -/// Parsing of "sv *" sub commands. +/// Parsing of "strucvars *" sub commands. #[derive(Debug, Args)] #[command(args_conflicts_with_subcommands = true)] struct Strucvars { @@ -44,7 +44,7 @@ struct Strucvars { command: StrucvarsCommands, } -/// Enum supporting the parsing of "sv *" sub commands. +/// Enum supporting the parsing of "strucvars *" sub commands. #[derive(Debug, Subcommand)] enum StrucvarsCommands { Aggregate(strucvars::aggregate::cli::Args), @@ -62,7 +62,7 @@ struct Seqvars { command: SeqvarsCommands, } -/// Enum supporting the parsing of "sv *" sub commands. +/// Enum supporting the parsing of "strucvars *" sub commands. #[derive(Debug, Subcommand)] enum SeqvarsCommands { Aggregate(seqvars::aggregate::Args), diff --git a/src/proto/varfish/v1/sv.proto b/src/proto/varfish/v1/sv.proto index 9f767394..f2b131c8 100644 --- a/src/proto/varfish/v1/sv.proto +++ b/src/proto/varfish/v1/sv.proto @@ -56,24 +56,6 @@ message MaskedDatabase { repeated MaskedDbRecord records = 1; } -// Record for the gene region database. -message GeneRegionRecord { - // Numeric chromosome number. - int32 chrom_no = 1; - // 1-based start position. - int32 start = 2; - // 1-based stop position. - int32 stop = 3; - // Numeric gene identifier. - uint32 gene_id = 4; -} - -// Gene region database. -message GeneRegionDatabase { - // List of gene region records. - repeated GeneRegionRecord records = 1; -} - // Record for the gene cross-link database. message XlinkRecord { // HGNC ID (e.g., "HGNC:123") diff --git a/src/strucvars/aggregate/cli.rs b/src/strucvars/aggregate/cli.rs index 95d5c9c8..cf6f331a 100644 --- a/src/strucvars/aggregate/cli.rs +++ b/src/strucvars/aggregate/cli.rs @@ -279,7 +279,7 @@ pub struct Args { pub slack_ins: i32, } -/// Main entry point for the `sv bg-db-to-bin` command. +/// Main entry point for the `strucvars txt-to-bin` command. pub fn run(common_args: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { tracing::info!("Starting `strucvars aggregate`"); tracing::info!(" common_args = {:?}", &common_args); diff --git a/src/strucvars/query/bgdbs.rs b/src/strucvars/query/bgdbs.rs index 50311ec3..8286ff71 100644 --- a/src/strucvars/query/bgdbs.rs +++ b/src/strucvars/query/bgdbs.rs @@ -133,7 +133,7 @@ impl BeginEnd for BgDbRecord { } } -/// Load background database from a `.bin` file as created by `sv convert-bgdb`. +/// Load background database from a `.bin` file as created by `strucvar txt-to-bin`. #[tracing::instrument] pub fn load_bg_db_records(path: &Path) -> Result { tracing::debug!("loading binary bg db records from {:?}", path); diff --git a/src/strucvars/query/genes.rs b/src/strucvars/query/genes.rs index 12ffcf48..4d15d098 100644 --- a/src/strucvars/query/genes.rs +++ b/src/strucvars/query/genes.rs @@ -1,90 +1,13 @@ //! Code for supporting annotation with overlapping genes. -use std::{collections::HashSet, ops::Range, path::Path, time::Instant}; +use std::{collections::HashSet, path::Path, time::Instant}; -use bio::data_structures::interval_tree::ArrayBackedIntervalTree; use mehari::common::open_read_maybe_gz; use prost::Message; use serde::Deserialize; use tracing::info; -use crate::{ - common::{Database, GenomeRelease, CHROMS}, - strucvars::pbs, -}; - -/// Alias for the interval tree that we use. -type IntervalTree = ArrayBackedIntervalTree; - -/// Information to store for a TAD set entry. -#[derive(Default, Clone, Debug)] -pub struct GeneRegionRecord { - /// 0-based begin position. - pub begin: i32, - /// End position. - pub end: i32, - /// gene ID - pub gene_id: u32, -} - -/// Gene region overlapping information. -#[derive(Default, Debug)] -pub struct GeneRegionDb { - /// Records, stored by chromosome. - pub records: Vec>, - /// Interval trees, stored by chromosome. - pub trees: Vec, -} - -impl GeneRegionDb { - /// Return IDs of overlapping genes. - pub fn overlapping_gene_ids(&self, chrom_no: u32, query: Range) -> Vec { - self.trees[chrom_no as usize] - .find(query) - .iter() - .map(|cursor| self.records[chrom_no as usize][*cursor.data() as usize].gene_id) - .collect() - } -} - -#[tracing::instrument] -fn load_gene_regions_db(path: &Path) -> Result { - tracing::debug!("loading binary gene region records from {:?}...", path); - - let before_loading = Instant::now(); - let mut result = GeneRegionDb::default(); - for _ in CHROMS { - result.records.push(Vec::new()); - result.trees.push(IntervalTree::new()); - } - - let fcontents = - std::fs::read(path).map_err(|e| anyhow::anyhow!("error reading {:?}: {}", &path, e))?; - let gene_region_db = pbs::GeneRegionDatabase::decode(std::io::Cursor::new(fcontents)) - .map_err(|e| anyhow::anyhow!("error decoding {:?}: {}", &path, e))?; - - let mut total_count = 0; - for record in gene_region_db.records.into_iter() { - let chrom_no = record.chrom_no as usize; - let key = (record.start - 1)..record.stop; - result.trees[chrom_no].insert(key, result.records[chrom_no].len() as u32); - result.records[chrom_no].push(GeneRegionRecord { - begin: record.start - 1, - end: record.stop, - gene_id: record.gene_id, - }); - - total_count += 1; - } - result.trees.iter_mut().for_each(|tree| tree.index()); - tracing::debug!( - "... done loading {} records and building trees in {:?}", - total_count, - before_loading.elapsed(), - ); - - Ok(result) -} +use crate::{common::GenomeRelease, strucvars::pbs}; /// Information to store for the interlink table. #[derive(Default, Debug)] @@ -108,43 +31,6 @@ pub struct XlinkDb { pub from_hgnc: multimap::MultiMap, } -impl XlinkDb { - fn entrez_id_to_symbols(&self, entrez_id: u32) -> Vec { - let mut tmp = self - .from_entrez - .get_vec(&entrez_id) - .map_or(Vec::new(), |idxs| { - idxs.iter() - .map(|idx| self.records[*idx as usize].symbol.clone()) - .collect::>() - }); - tmp.sort(); - tmp.dedup(); - tmp - } - - fn ensembl_to_symbols(&self, ensembl_id: u32) -> Vec { - let mut tmp = self - .from_ensembl - .get_vec(&ensembl_id) - .map_or(Vec::new(), |idxs| { - idxs.iter() - .map(|idx| self.records[*idx as usize].symbol.clone()) - .collect::>() - }); - tmp.sort(); - tmp.dedup(); - tmp - } - - pub fn gene_id_to_symbols(&self, database: Database, gene_id: u32) -> Vec { - match database { - Database::RefSeq => self.entrez_id_to_symbols(gene_id), - Database::Ensembl => self.ensembl_to_symbols(gene_id), - } - } -} - #[tracing::instrument] fn load_xlink_db(path: &Path) -> Result { tracing::debug!("loading binary xlink records from {:?}...", path); @@ -287,44 +173,17 @@ fn load_mim2gene_db(path: &Path) -> Result { /// Bundle of gene region DBs and the xlink info packaged with VarFish. #[derive(Default, Debug)] pub struct GeneDb { - pub refseq: GeneRegionDb, - pub ensembl: GeneRegionDb, pub xlink: XlinkDb, pub acmg: AcmgDb, pub mim2gene: OmimDb, } -impl GeneDb { - /// Return IDs of overlapping genes. - pub fn overlapping_gene_ids( - &self, - database: Database, - chrom_no: u32, - query: Range, - ) -> Vec { - match database { - Database::RefSeq => self.refseq.overlapping_gene_ids(chrom_no, query), - Database::Ensembl => self.ensembl.overlapping_gene_ids(chrom_no, query), - } - } -} - // Load all gene information, such as region, id mapping and symbols. #[tracing::instrument] pub fn load_gene_db(path_db: &str, genome_release: GenomeRelease) -> Result { info!("Loading gene dbs"); let result = GeneDb { - refseq: load_gene_regions_db( - Path::new(path_db) - .join(format!("{}/genes/refseq_regions.bin", genome_release)) - .as_path(), - )?, - ensembl: load_gene_regions_db( - Path::new(path_db) - .join(format!("{}/genes/ensembl_regions.bin", genome_release)) - .as_path(), - )?, xlink: load_xlink_db(Path::new(path_db).join("noref/genes/xlink.bin").as_path())?, acmg: load_acmg_db(Path::new(path_db).join("noref/genes/acmg.tsv").as_path())?, mim2gene: load_mim2gene_db( diff --git a/src/strucvars/query/interpreter.rs b/src/strucvars/query/interpreter.rs index b19c4984..0f333d02 100644 --- a/src/strucvars/query/interpreter.rs +++ b/src/strucvars/query/interpreter.rs @@ -30,7 +30,7 @@ pub fn overlaps(s1: i32, e1: i32, s2: i32, e2: i32) -> bool { #[derive(Debug)] pub struct QueryInterpreter { pub query: CaseQuery, - pub gene_allowlist: Option>, + pub hgvs_allowlist: Option>, } /// Result type for `QueryInterpreter::passes_genotype()`. @@ -49,10 +49,10 @@ pub struct PassesResult { impl QueryInterpreter { /// Construct new `QueryInterpreter` with the given query settings. - pub fn new(query: CaseQuery, gene_allowlist: Option>) -> Self { + pub fn new(query: CaseQuery, hgvs_allowlist: Option>) -> Self { QueryInterpreter { query, - gene_allowlist, + hgvs_allowlist, } } @@ -302,13 +302,14 @@ impl QueryInterpreter { } /// Determine whether the `sv` passes the gene allow list filter. - pub fn passes_genes(&self, ovl_gene_ids: &[u32]) -> bool { - if let Some(gene_allowlist) = self.gene_allowlist.as_ref() { - if gene_allowlist.is_empty() { + pub fn passes_genes(&self, ovl_hgvs_ids: &[String]) -> bool { + if let Some(hgvs_allowlist) = self.hgvs_allowlist.as_ref() { + if hgvs_allowlist.is_empty() { true } else { - let ovl_gene_ids: HashSet = HashSet::from_iter(ovl_gene_ids.iter().copied()); - !ovl_gene_ids.is_disjoint(gene_allowlist) + let ovl_hgvs_ids: HashSet = + HashSet::from_iter(ovl_hgvs_ids.iter().cloned()); + !ovl_hgvs_ids.is_disjoint(hgvs_allowlist) } } else { true @@ -329,18 +330,18 @@ impl QueryInterpreter { } /// Determine whether the annotated `StructuralVariant` passes all criteria. - pub fn passes( + pub fn passes( &self, sv: &StructuralVariant, count_bg: &mut CountBg, count_masked: &mut CountMasked, - ovl_gene_ids: &mut OvlGeneIds, + ovl_hgvs_ids: &mut OvlHgvsIds, tx_effects: &mut TxEffects, ) -> Result where CountBg: FnMut(&StructuralVariant) -> BgDbOverlaps, CountMasked: FnMut(&StructuralVariant) -> MaskedBreakpointCount, - OvlGeneIds: FnMut(&StructuralVariant) -> Vec, + OvlHgvsIds: FnMut(&StructuralVariant) -> Vec, TxEffects: FnMut(&StructuralVariant) -> Vec, { // We first check for matching genotype. If this succeeds then we execute the @@ -354,7 +355,7 @@ impl QueryInterpreter { let passes_result = self.passes_genotype(sv, &count_masked(sv))?; if !passes_result.pass_all { Ok(Default::default()) - } else if !self.passes_genes(&ovl_gene_ids(sv)) { + } else if !self.passes_genes(&ovl_hgvs_ids(sv)) { trace!("... SV does not gene allow list filter"); Ok(Default::default()) } else if !self.passes_counts(&count_bg(sv)) { @@ -379,9 +380,8 @@ mod tests { use indexmap::IndexMap; use mehari::annotate::strucvars::csq::interface::StrandOrientation; - use crate::{ - common::Database, - strucvars::query::schema::{CallInfo, GenomicRegion, GenotypeChoice, GenotypeCriteria}, + use crate::strucvars::query::schema::{ + CallInfo, GenomicRegion, GenotypeChoice, GenotypeCriteria, }; use super::*; @@ -398,7 +398,7 @@ mod tests { #[test] fn test_query_interpreter_smoke() { - let query = CaseQuery::new(Database::RefSeq); + let query = CaseQuery::default(); let _interpreter = QueryInterpreter::new(query, None); } @@ -406,7 +406,7 @@ mod tests { fn test_query_interpreter_passes_simple_pass_sv_type() { let query = CaseQuery { sv_types: vec![SvType::Del], - ..CaseQuery::new(Database::RefSeq) + ..Default::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -429,7 +429,7 @@ mod tests { fn test_query_interpreter_passes_simple_fail_sv_type() { let query = CaseQuery { sv_types: vec![SvType::Ins], - ..CaseQuery::new(Database::RefSeq) + ..Default::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -452,7 +452,7 @@ mod tests { fn test_query_interpreter_passes_simple_fail_sv_sub_type() { let query = CaseQuery { sv_sub_types: vec![SvSubType::Ins], - ..CaseQuery::new(Database::RefSeq) + ..Default::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -476,7 +476,7 @@ mod tests { let query = CaseQuery { sv_size_min: Some(50), sv_size_max: Some(500), - ..CaseQuery::new(Database::RefSeq) + ..Default::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -499,7 +499,7 @@ mod tests { fn test_query_interpreter_passes_simple_fail_size_min() { let query = CaseQuery { sv_size_min: Some(5000), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -522,7 +522,7 @@ mod tests { fn test_query_interpreter_passes_simple_fail_size_max() { let query = CaseQuery { sv_size_max: Some(1), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -545,7 +545,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_pass_linear_overlap() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::new("chr1", 150, 160)]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -568,7 +568,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_pass_linear_whole_chromosome() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::whole_chrom("chr1")]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -591,7 +591,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_fail_linear_overlap() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::new("chr1", 201, 250)]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -614,7 +614,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_fail_linear_whole_chromosome() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::whole_chrom("chr2")]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -637,7 +637,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_pass_ins_overlap() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::new("chr1", 150, 160)]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -660,7 +660,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_pass_ins_whole_chromosome() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::whole_chrom("chr1")]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -683,7 +683,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_fail_ins_overlap() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::new("chr1", 201, 250)]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -706,7 +706,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_fail_ins_whole_chromosome() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::whole_chrom("chr2")]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -729,7 +729,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_pass_bnd_overlap_pos() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::new("chr1", 150, 160)]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -752,7 +752,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_pass_bnd_whole_chromosome_pos() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::whole_chrom("chr1")]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -775,7 +775,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_pass_bnd_overlap_end() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::new("chr2", 150, 160)]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -798,7 +798,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_pass_bnd_whole_chromosome_end() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::whole_chrom("chr2")]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -821,7 +821,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_fail_bnd_overlap() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::new("chr1", 201, 250)]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -844,7 +844,7 @@ mod tests { fn test_query_interpreter_passes_genomic_region_fail_bnd_whole_chromosome() { let query = CaseQuery { genomic_region: Some(vec![GenomicRegion::whole_chrom("chr2")]), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -880,7 +880,7 @@ mod tests { svdb_g1k_max_count: Some(10), svdb_inhouse_enabled: true, svdb_inhouse_max_count: Some(10), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -914,7 +914,7 @@ mod tests { svdb_g1k_max_count: Some(10), svdb_inhouse_enabled: true, svdb_inhouse_max_count: Some(10), - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -952,7 +952,7 @@ mod tests { min_sr_var: Some(5), ..GenotypeCriteria::new(GenotypeChoice::Het) }], - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -1102,7 +1102,7 @@ mod tests { min_sr_var: Some(1), ..GenotypeCriteria::new(GenotypeChoice::Variant) }], - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -1163,7 +1163,7 @@ mod tests { min_sr_var: Some(5), ..GenotypeCriteria::new(GenotypeChoice::Het) }], - ..CaseQuery::new(Database::RefSeq) + ..CaseQuery::default() }; let interpreter = QueryInterpreter::new(query, None); @@ -1203,7 +1203,7 @@ mod tests { #[test] fn test_query_interpreter_passes_smoke() -> Result<(), anyhow::Error> { - let query = CaseQuery::new(Database::RefSeq); + let query = CaseQuery::default(); let interpreter = QueryInterpreter::new(query, None); let sv_pass = StructuralVariant { diff --git a/src/strucvars/query/masked.rs b/src/strucvars/query/masked.rs index 820444d0..1dfc5a8a 100644 --- a/src/strucvars/query/masked.rs +++ b/src/strucvars/query/masked.rs @@ -127,7 +127,7 @@ impl BeginEnd for MaskedDbRecord { } } -/// Load background database from a `.bin` file as created by `sv convert-bgdb`. +/// Load background database from a `.bin` file as created by `strucvars txt-to-bin`. #[tracing::instrument] pub fn load_masked_db_records(path: &Path) -> Result { tracing::debug!("loading binary masked db records from {:?}", path); diff --git a/src/strucvars/query/mod.rs b/src/strucvars/query/mod.rs index 14fe2405..c36997fe 100644 --- a/src/strucvars/query/mod.rs +++ b/src/strucvars/query/mod.rs @@ -1,4 +1,4 @@ -//! Code implementing the "sv query" sub command. +//! Code implementing the "strucvars query" sub command. pub mod bgdbs; pub mod clinvar; @@ -36,7 +36,7 @@ use uuid::Uuid; use crate::{ common::{build_chrom_map, numeric_gene_id, trace_rss_now}, - common::{Database, GenomeRelease, TadSet as TadSetChoice}, + common::{GenomeRelease, TadSet as TadSetChoice}, strucvars::query::{ interpreter::QueryInterpreter, pathogenic::Record as KnownPathogenicRecord, schema::CaseQuery, schema::StructuralVariant, @@ -56,9 +56,9 @@ use self::{ /// Length of the upstream/downstream region. static X_STREAM: i32 = 5000; -/// Command line arguments for `sv query` sub command. +/// Command line arguments for `strucvars query` sub command. #[derive(Parser, Debug)] -#[command(author, version, about = "Run query for SVs", long_about = None)] +#[command(author, version, about = "Run query for strucvars", long_about = None)] pub struct Args { /// Genome release to assume. #[arg(long, value_enum)] @@ -175,11 +175,8 @@ struct ResultRecord { payload: String, } -fn resolve_gene_id(database: Database, gene_db: &GeneDb, gene_id: u32) -> Vec { - let record_idxs = match database { - Database::RefSeq => gene_db.xlink.from_entrez.get_vec(&gene_id), - Database::Ensembl => gene_db.xlink.from_ensembl.get_vec(&gene_id), - }; +fn resolve_hgvs_id(gene_db: &GeneDb, hgvs_id: &str) -> Vec { + let record_idxs = gene_db.xlink.from_hgnc.get_vec(hgvs_id); if let Some(record_idxs) = record_idxs { record_idxs .iter() @@ -196,24 +193,14 @@ fn resolve_gene_id(database: Database, gene_db: &GeneDb, gene_id: u32) -> Vec vec![Gene { - entrez_id: Some(gene_id), - symbol: None, - ensembl_id: None, - hgnc_id: None, - is_acmg: gene_db.acmg.contains(gene_id), - is_disease_gene: gene_db.mim2gene.contains(gene_id), - }], - Database::Ensembl => vec![Gene { - ensembl_id: Some(format!("ENSG{gene_id:011}")), - symbol: None, - entrez_id: None, - hgnc_id: None, - is_acmg: false, - is_disease_gene: false, - }], - } + vec![Gene { + entrez_id: None, + symbol: None, + ensembl_id: None, + hgnc_id: Some(hgvs_id.to_string()), + is_acmg: false, + is_disease_gene: false, + }] } } @@ -266,13 +253,15 @@ fn run_query( ..ResultPayload::default() }; - let mut ovl_gene_ids = Vec::new(); + let mut ovl_hgnc_ids = Vec::new(); - let sv_query = if matches!(record_sv.sv_type, SvType::Ins | SvType::Bnd) { - record_sv.pos.saturating_sub(1)..record_sv.pos - } else { - record_sv.pos.saturating_sub(1)..record_sv.end - }; + let chrom = chrom_to_acc + .get(&annonars::common::cli::canonicalize(&record_sv.chrom)) + .expect("invalid chromosome"); + let chrom_idx = *mehari_tx_idx + .contig_to_idx + .get(chrom) + .expect("cannot map idx"); let passes = interpreter.passes( &record_sv, @@ -292,15 +281,18 @@ fn run_query( result_payload.masked_breakpoints.clone() }, &mut |sv: &StructuralVariant| { - ovl_gene_ids = dbs.genes.overlapping_gene_ids( - interpreter.query.database, - *chrom_map - .get(&sv.chrom) - .expect("cannot translate chromosome") as u32, - sv_query.clone(), - ); - ovl_gene_ids.sort(); - ovl_gene_ids.clone() + let sv_query: std::ops::Range = + if matches!(sv.sv_type, SvType::Ins | SvType::Bnd) { + sv.pos.saturating_sub(1)..sv.pos + } else { + sv.pos.saturating_sub(1)..sv.end + }; + + ovl_hgnc_ids = + overlapping_hgnc_ids(mehari_tx_db, mehari_tx_idx, chrom_idx, sv_query); + ovl_hgnc_ids.sort(); + ovl_hgnc_ids.dedup(); + ovl_hgnc_ids.clone() }, &mut |sv: &StructuralVariant| { result_payload.tx_effects = @@ -350,49 +342,46 @@ fn run_query( .collect(); // Get genes in overlapping TADs - let tad_gene_ids = { - let gene_ids: HashSet<_> = HashSet::from_iter(ovl_gene_ids.iter()); + let tad_hgnc_ids = { + let hgnc_ids: HashSet<_> = HashSet::from_iter(ovl_hgnc_ids.iter()); let tads = dbs.tad_sets .overlapping_tads(TadSetChoice::Hesc, &record_sv, &chrom_map); - let mut tad_gene_ids = Vec::new(); + let mut tad_hgvs_ids = Vec::new(); tads.iter() .map(|tad| { - dbs.genes.overlapping_gene_ids( - interpreter.query.database, - tad.chrom_no, + overlapping_hgnc_ids( + mehari_tx_db, + mehari_tx_idx, + chrom_idx, (tad.begin - 1)..tad.end, ) }) - .for_each(|mut v| tad_gene_ids.append(&mut v)); - let tad_gene_ids: HashSet<_> = HashSet::from_iter(tad_gene_ids.into_iter()); - let mut tad_gene_ids = Vec::from_iter(tad_gene_ids); - tad_gene_ids.retain(|gene_id| !gene_ids.contains(gene_id)); - tad_gene_ids.sort(); - tad_gene_ids + .for_each(|mut v| tad_hgvs_ids.append(&mut v)); + let tad_hgvs_ids: HashSet<_> = HashSet::from_iter(tad_hgvs_ids.into_iter()); + let mut tad_hgvs_ids = Vec::from_iter(tad_hgvs_ids); + tad_hgvs_ids.retain(|hgvs_id| !hgnc_ids.contains(hgvs_id)); + tad_hgvs_ids.sort(); + tad_hgvs_ids }; result_payload.tad_boundary_distance = dbs.tad_sets .boundary_dist(TadSetChoice::Hesc, &record_sv, &chrom_map); // Convert the genes into more verbose records and put them into the result - ovl_gene_ids.iter().for_each(|gene_id| { - result_payload.ovl_genes.append(&mut resolve_gene_id( - interpreter.query.database, - &dbs.genes, - *gene_id, - )) + ovl_hgnc_ids.iter().for_each(|hgvs_id| { + result_payload + .ovl_genes + .append(&mut resolve_hgvs_id(&dbs.genes, hgvs_id)) }); result_payload.ovl_disease_gene = result_payload .ovl_genes .iter() .any(|gene| gene.is_disease_gene); - tad_gene_ids.iter().for_each(|gene_id| { - result_payload.tad_genes.append(&mut resolve_gene_id( - interpreter.query.database, - &dbs.genes, - *gene_id, - )) + tad_hgnc_ids.iter().for_each(|hgvs_id| { + result_payload + .tad_genes + .append(&mut resolve_hgvs_id(&dbs.genes, hgvs_id)) }); result_payload.tad_disease_gene = result_payload .tad_genes @@ -814,6 +803,24 @@ fn compute_tx_effects( } } +/// Compute overlapping HGNC gene IDs for a given interval. +pub fn overlapping_hgnc_ids( + tx_seq_db: &TxSeqDatabase, + tx_idx: &TxIntervalTrees, + chrom_idx: usize, + query: std::ops::Range, +) -> Vec { + let tx_db = tx_seq_db + .tx_db + .as_ref() + .expect("transcripts must be present"); + let tree = &tx_idx.trees[chrom_idx]; + tree.find(query.clone()) + .iter() + .map(|it| format!("HGNC:{}", tx_db.transcripts[*it.data() as usize].gene_id)) + .collect::>() +} + /// Bundle the used database to reduce argument count. #[derive(Default, Debug)] pub struct Databases { @@ -826,11 +833,7 @@ pub struct Databases { } /// Translate gene allow list to gene identifier sfrom -fn translate_gene_allowlist( - gene_allowlist: &Vec, - dbs: &Databases, - query: &CaseQuery, -) -> HashSet { +fn translate_gene_allowlist(gene_allowlist: &Vec, dbs: &Databases) -> HashSet { let mut result = HashSet::new(); let re_entrez = regex::Regex::new(r"^\d+").expect("invalid regex in source code"); @@ -839,32 +842,21 @@ fn translate_gene_allowlist( let re_hgnc: regex::Regex = regex::Regex::new(r"HGNC:\d+").expect("invalid regex in source code"); - let symbol_to_id: HashMap<_, _> = - HashMap::from_iter(dbs.genes.xlink.records.iter().map(|record| { - ( - record.symbol.clone(), - match query.database { - Database::RefSeq => record.entrez_id, - Database::Ensembl => record.ensembl_gene_id, - }, - ) - })); + let symbol_to_id: HashMap<_, _> = HashMap::from_iter( + dbs.genes + .xlink + .records + .iter() + .map(|record| (record.symbol.clone(), record.hgnc_id.clone())), + ); for gene in gene_allowlist { let gene = gene.trim(); if re_entrez.is_match(gene) { if let Ok(gene_id) = numeric_gene_id(gene) { - match query.database { - Database::RefSeq => { - result.insert(gene_id); - } - Database::Ensembl => { - if let Some(record_ids) = dbs.genes.xlink.from_ensembl.get_vec(&gene_id) { - for record_id in record_ids { - result - .insert(dbs.genes.xlink.records[*record_id as usize].entrez_id); - } - }; + if let Some(record_ids) = dbs.genes.xlink.from_ensembl.get_vec(&gene_id) { + for record_id in record_ids { + result.insert(dbs.genes.xlink.records[*record_id as usize].hgnc_id.clone()); } } } else { @@ -873,20 +865,11 @@ fn translate_gene_allowlist( } } else if re_ensembl.is_match(gene) { if let Ok(gene_id) = numeric_gene_id(gene) { - match query.database { - Database::RefSeq => { - if let Some(record_ids) = dbs.genes.xlink.from_entrez.get_vec(&gene_id) { - for record_id in record_ids { - result.insert( - dbs.genes.xlink.records[*record_id as usize].ensembl_gene_id, - ); - } - }; - } - Database::Ensembl => { - result.insert(gene_id); + if let Some(record_ids) = dbs.genes.xlink.from_entrez.get_vec(&gene_id) { + for record_id in record_ids { + result.insert(dbs.genes.xlink.records[*record_id as usize].hgnc_id.clone()); } - } + }; } else { warn!("Cannot map candidate ENSEMBL gene identifier {}", &gene); continue; @@ -895,17 +878,7 @@ fn translate_gene_allowlist( if dbs.genes.xlink.from_hgnc.contains_key(gene) { if let Some(record_ids) = dbs.genes.xlink.from_hgnc.get_vec(gene) { for record_id in record_ids { - match query.database { - Database::RefSeq => { - result - .insert(dbs.genes.xlink.records[*record_id as usize].entrez_id); - } - Database::Ensembl => { - result.insert( - dbs.genes.xlink.records[*record_id as usize].ensembl_gene_id, - ); - } - } + result.insert(dbs.genes.xlink.records[*record_id as usize].hgnc_id.clone()); } } } else { @@ -913,7 +886,7 @@ fn translate_gene_allowlist( continue; } } else if let Some(gene_id) = symbol_to_id.get(gene) { - result.insert(*gene_id); + result.insert(gene_id.clone()); } else { warn!("Could not map candidate gene symbol {}", &gene); } @@ -977,6 +950,7 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: &args.path_db, &args.genome_release.to_string() ); + tracing::debug!(" path = {}", &path_mehari_tx_db); let mehari_tx_db = mehari::annotate::seqvars::load_tx_db(&path_mehari_tx_db)?; tracing::info!( "...done loading mehari tx database in {:?}", @@ -1003,11 +977,11 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: trace_rss_now(); tracing::info!("Translating gene allow list..."); - let gene_allowlist = if let Some(gene_allowlist) = &query.gene_allowlist { + let hgvs_allowlist = if let Some(gene_allowlist) = &query.gene_allowlist { if gene_allowlist.is_empty() { None } else { - Some(translate_gene_allowlist(gene_allowlist, &dbs, &query)) + Some(translate_gene_allowlist(gene_allowlist, &dbs)) } } else { None @@ -1016,7 +990,7 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: tracing::info!("Running queries..."); let before_query = Instant::now(); let query_stats = run_query( - &QueryInterpreter::new(query, gene_allowlist), + &QueryInterpreter::new(query, hgvs_allowlist), args, &dbs, &mehari_tx_db, @@ -1038,7 +1012,7 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: trace_rss_now(); tracing::info!( - "All of `sv query` completed in {:?}", + "All of `strucvars query` completed in {:?}", before_anything.elapsed() ); Ok(()) @@ -1046,8 +1020,8 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: #[cfg(test)] mod test { + // #[tracing_test::traced_test] #[test] - #[tracing_test::traced_test] fn smoke_test() -> Result<(), anyhow::Error> { let tmpdir = temp_testdir::TempDir::default(); let path_output = format!("{}/out.tsv", tmpdir.to_string_lossy()); diff --git a/src/strucvars/query/schema.rs b/src/strucvars/query/schema.rs index 7b25e3b9..f29e1a12 100644 --- a/src/strucvars/query/schema.rs +++ b/src/strucvars/query/schema.rs @@ -1,6 +1,6 @@ //! Supporting code for SV query definition. -use crate::common::{Database, TadSet}; +use crate::common::TadSet; use indexmap::IndexMap; use mehari::annotate::strucvars::{ bnd::Breakend, csq::interface::StrandOrientation, PeOrientation, @@ -1079,9 +1079,6 @@ impl GenotypeCriteria { /// Define a query for structural variants from a case. #[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] pub struct CaseQuery { - /// The transcript database to use - pub database: Database, - /// Whether to enable SVDB overlap queries with DGV. pub svdb_dgv_enabled: bool, /// The minimal reciprocal overlap for querying DGV. @@ -1220,10 +1217,9 @@ where })) } -impl CaseQuery { - pub fn new(database: Database) -> Self { +impl Default for CaseQuery { + fn default() -> Self { CaseQuery { - database, svdb_dgv_enabled: false, svdb_dgv_min_overlap: None, svdb_dgv_max_count: None, @@ -1715,28 +1711,6 @@ mod tests { ); } - #[test] - fn test_genomic_region_serde_smoke() { - assert_tokens( - &Database::RefSeq, - &[Token::UnitVariant { - name: "Database", - variant: "refseq", - }], - ); - } - - #[test] - fn test_database_smoke() { - assert_tokens( - &Database::RefSeq, - &[Token::UnitVariant { - name: "Database", - variant: "refseq", - }], - ); - } - #[test] fn test_sv_type_serde_smoke() { assert_tokens( @@ -1997,7 +1971,7 @@ mod tests { #[test] fn test_case_query_serde_smoke() { - let query: CaseQuery = CaseQuery::new(Database::RefSeq); + let query: CaseQuery = CaseQuery::default(); insta::assert_snapshot!(serde_json::to_string_pretty(&query).unwrap()); } diff --git a/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__schema__tests__case_query_serde_smoke.snap b/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__schema__tests__case_query_serde_smoke.snap index 38e6fe1f..e654c669 100644 --- a/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__schema__tests__case_query_serde_smoke.snap +++ b/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__schema__tests__case_query_serde_smoke.snap @@ -3,7 +3,6 @@ source: src/strucvars/query/schema.rs expression: "serde_json::to_string_pretty(&query).unwrap()" --- { - "database": "refseq", "svdb_dgv_enabled": false, "svdb_dgv_min_overlap": null, "svdb_dgv_max_count": null, diff --git a/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__test__smoke_test.snap b/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__test__smoke_test.snap index f2f81fec..d96dce77 100644 --- a/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__test__smoke_test.snap +++ b/src/strucvars/query/snapshots/varfish_server_worker__strucvars__query__test__smoke_test.snap @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbb4c02455aa29e6c74dbc8bf8d229e29ddc8bd89b77782974922c07aa4e2ded -size 709528 +oid sha256:9db992f9b108124349f477c745ee697cee5771036711624a9671b5afce2f8d44 +size 10341 diff --git a/src/strucvars/txt_to_bin/cli.rs b/src/strucvars/txt_to_bin/cli.rs index 57098930..b486144e 100644 --- a/src/strucvars/txt_to_bin/cli.rs +++ b/src/strucvars/txt_to_bin/cli.rs @@ -7,7 +7,7 @@ use clap::Parser; use crate::{ common::trace_rss_now, strucvars::txt_to_bin::{ - clinvar, gene_region, masked, + clinvar, masked, vardbs::{self, InputFileType}, xlink, }, @@ -74,8 +74,6 @@ pub enum InputType { StrucvarG1k, /// Convert gnomAD SV to binary. StrucvarGnomadSv, - /// Convert gene region to binary. - GeneRegion, /// Convert masked region to binary. MaskedRegion, /// Convert cross-link to binary. @@ -100,7 +98,7 @@ pub struct Args { pub path_output: PathBuf, } -/// Main entry point for the `sv bg-db-to-bin` command. +/// Main entry point for the `strucvars txt-to-bin` command. pub fn run(common_args: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { tracing::info!("Starting `db to-bin`"); tracing::info!(" common_args = {:?}", &common_args); @@ -140,7 +138,6 @@ pub fn run(common_args: &crate::common::Args, args: &Args) -> Result<(), anyhow: InputType::StrucvarGnomadSv => { vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::Gnomad)? } - InputType::GeneRegion => gene_region::convert_to_bin(&args.path_input, &args.path_output)?, InputType::MaskedRegion => masked::convert_to_bin(&args.path_input, &args.path_output)?, InputType::Xlink => xlink::convert_to_bin(&args.path_input, &args.path_output)?, } @@ -323,26 +320,6 @@ mod test { Ok(()) } - #[test] - fn run_gene_region_smoke() -> Result<(), anyhow::Error> { - let tmp_dir = temp_testdir::TempDir::default(); - let common_args = common::Args { - verbose: Verbosity::new(0, 0), - }; - let args = Args { - assembly: None, - input_type: InputType::GeneRegion, - path_input: String::from( - "tests/db/to-bin/varfish-db-downloader/features/grch37/gene_regions/refseq.bed.gz", - ), - path_output: tmp_dir.join("refseq.bin"), - }; - - super::run(&common_args, &args)?; - - Ok(()) - } - #[test] fn run_masked_region_smoke() -> Result<(), anyhow::Error> { let tmp_dir = temp_testdir::TempDir::default(); diff --git a/src/strucvars/txt_to_bin/gene_region.rs b/src/strucvars/txt_to_bin/gene_region.rs deleted file mode 100644 index 00398c78..00000000 --- a/src/strucvars/txt_to_bin/gene_region.rs +++ /dev/null @@ -1,87 +0,0 @@ -//! Code for converting gene region from text-based to binary format. - -use std::{fs::File, io::Write, path::Path, time::Instant}; - -use mehari::common::open_read_maybe_gz; -use prost::Message; -use thousands::Separable; - -use crate::{ - common::{build_chrom_map, numeric_gene_id, trace_rss_now}, - strucvars::pbs::{GeneRegionDatabase, GeneRegionRecord}, -}; - -/// Module with code supporting the parsing. -mod input { - use serde::Deserialize; - - /// Record as created by VarFish DB Downloader. - #[derive(Debug, Deserialize)] - pub struct Record { - /// Chromosome name - pub chromosome: String, - /// 0-based begin position - pub begin: i32, - /// 1-based end position - pub end: i32, - /// ENSEMBL or Entrez gene ID - pub gene_id: String, - } -} - -/// Perform conversion to protocolbuffers `.bin` file. -pub fn convert_to_bin(path_input_tsv: P, path_output: Q) -> Result<(), anyhow::Error> -where - P: AsRef, - Q: AsRef, -{ - tracing::debug!( - "Converting gene regions from BED {:?} to binary {:?}", - path_input_tsv.as_ref(), - path_output.as_ref() - ); - let chrom_map = build_chrom_map(); - - // Setup CSV reader for BED file - header is written as comment and must be - // ignored. - let mut reader = csv::ReaderBuilder::new() - .has_headers(false) - .delimiter(b'\t') - .comment(Some(b'#')) - .from_reader(open_read_maybe_gz(path_input_tsv.as_ref())?); - let before_parsing = Instant::now(); - - let mut records = Vec::new(); - for record in reader.deserialize() { - let record: input::Record = record?; - records.push(GeneRegionRecord { - chrom_no: *chrom_map - .get(&record.chromosome) - .unwrap_or_else(|| panic!("unknown chrom {:?}", &record.chromosome)) - as i32, - start: record.begin + 1, - stop: record.end, - gene_id: numeric_gene_id(&record.gene_id)?, - }); - } - let gene_region_db = GeneRegionDatabase { records }; - - tracing::debug!( - "total time spent reading {:?} records: {:?}", - gene_region_db.records.len().separate_with_commas(), - before_parsing.elapsed() - ); - trace_rss_now(); - - let before_writing = Instant::now(); - let mut output_file = File::create(&path_output)?; - output_file.write_all(&gene_region_db.encode_to_vec())?; - output_file.flush()?; - tracing::debug!( - "total time spent writing {} records: {:?}", - gene_region_db.records.len().separate_with_commas(), - before_writing.elapsed() - ); - - Ok(()) -} diff --git a/src/strucvars/txt_to_bin/mod.rs b/src/strucvars/txt_to_bin/mod.rs index a4c19aa7..f337ae13 100644 --- a/src/strucvars/txt_to_bin/mod.rs +++ b/src/strucvars/txt_to_bin/mod.rs @@ -2,7 +2,6 @@ pub mod cli; pub mod clinvar; -pub mod gene_region; pub mod masked; pub mod vardbs; pub mod xlink; diff --git a/tests/strucvars/query/Case_3.ingested.vcf b/tests/strucvars/query/Case_3.ingested.vcf index 0b9f2292..da1e4712 100644 --- a/tests/strucvars/query/Case_3.ingested.vcf +++ b/tests/strucvars/query/Case_3.ingested.vcf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c526a6ebd03471dcf1e7b0b86e9c0911d27d02dc6f06a845b531c10005004f0 -size 10557593 +oid sha256:536b6d2cd9415825e1c62c17f9fc91098e46c4b7b975409611b823bc2d808559 +size 448994 diff --git a/tests/strucvars/query/db/worker/grch37/genes/ensembl_regions.bin b/tests/strucvars/query/db/worker/grch37/genes/ensembl_regions.bin deleted file mode 100644 index e3564b8a..00000000 --- a/tests/strucvars/query/db/worker/grch37/genes/ensembl_regions.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ae4dce528abe91b2430443221912a532122962d020a9f667cdc4ee65cf76abb -size 1026161 diff --git a/tests/strucvars/query/db/worker/grch37/genes/refseq_regions.bin b/tests/strucvars/query/db/worker/grch37/genes/refseq_regions.bin deleted file mode 100644 index a18852c8..00000000 --- a/tests/strucvars/query/db/worker/grch37/genes/refseq_regions.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af07bad558f76a7dd52c26a8f9b36c7d8d52884cfe300fe15c9ce17385ab3727 -size 784004