Skip to content

Commit

Permalink
feat: remove gene regions from "strucvars query" database enhancement (
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Oct 11, 2023
1 parent 1fb3c58 commit 28663b6
Show file tree
Hide file tree
Showing 18 changed files with 156 additions and 509 deletions.
24 changes: 0 additions & 24 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -514,30 +514,6 @@ where
Ok(buffer)
}

/// Enum for the supported gene/transcript databases.
#[derive(
serde::Serialize,
serde::Deserialize,
enum_map::Enum,
PartialEq,
Eq,
Clone,
Copy,
Debug,
Default,
strum::EnumString,
strum::Display,
)]
#[serde(rename_all = "lowercase")]
#[strum(serialize_all = "lowercase")]
pub enum Database {
/// RefSeq
#[default]
RefSeq,
/// ENSEMBL
Ensembl,
}

/// Enum for the supported TADs.
#[derive(
serde::Serialize,
Expand Down
6 changes: 3 additions & 3 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ enum Commands {
Seqvars(Seqvars),
}

/// Parsing of "sv *" sub commands.
/// Parsing of "strucvars *" sub commands.
#[derive(Debug, Args)]
#[command(args_conflicts_with_subcommands = true)]
struct Strucvars {
Expand All @@ -44,7 +44,7 @@ struct Strucvars {
command: StrucvarsCommands,
}

/// Enum supporting the parsing of "sv *" sub commands.
/// Enum supporting the parsing of "strucvars *" sub commands.
#[derive(Debug, Subcommand)]
enum StrucvarsCommands {
Aggregate(strucvars::aggregate::cli::Args),
Expand All @@ -62,7 +62,7 @@ struct Seqvars {
command: SeqvarsCommands,
}

/// Enum supporting the parsing of "sv *" sub commands.
/// Enum supporting the parsing of "strucvars *" sub commands.
#[derive(Debug, Subcommand)]
enum SeqvarsCommands {
Aggregate(seqvars::aggregate::Args),
Expand Down
18 changes: 0 additions & 18 deletions src/proto/varfish/v1/sv.proto
Original file line number Diff line number Diff line change
Expand Up @@ -56,24 +56,6 @@ message MaskedDatabase {
repeated MaskedDbRecord records = 1;
}

// Record for the gene region database.
message GeneRegionRecord {
// Numeric chromosome number.
int32 chrom_no = 1;
// 1-based start position.
int32 start = 2;
// 1-based stop position.
int32 stop = 3;
// Numeric gene identifier.
uint32 gene_id = 4;
}

// Gene region database.
message GeneRegionDatabase {
// List of gene region records.
repeated GeneRegionRecord records = 1;
}

// Record for the gene cross-link database.
message XlinkRecord {
// HGNC ID (e.g., "HGNC:123")
Expand Down
2 changes: 1 addition & 1 deletion src/strucvars/aggregate/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ pub struct Args {
pub slack_ins: i32,
}

/// Main entry point for the `sv bg-db-to-bin` command.
/// Main entry point for the `strucvars txt-to-bin` command.
pub fn run(common_args: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> {
tracing::info!("Starting `strucvars aggregate`");
tracing::info!(" common_args = {:?}", &common_args);
Expand Down
2 changes: 1 addition & 1 deletion src/strucvars/query/bgdbs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ impl BeginEnd for BgDbRecord {
}
}

/// Load background database from a `.bin` file as created by `sv convert-bgdb`.
/// Load background database from a `.bin` file as created by `strucvar txt-to-bin`.
#[tracing::instrument]
pub fn load_bg_db_records(path: &Path) -> Result<BgDb, anyhow::Error> {
tracing::debug!("loading binary bg db records from {:?}", path);
Expand Down
145 changes: 2 additions & 143 deletions src/strucvars/query/genes.rs
Original file line number Diff line number Diff line change
@@ -1,90 +1,13 @@
//! Code for supporting annotation with overlapping genes.

use std::{collections::HashSet, ops::Range, path::Path, time::Instant};
use std::{collections::HashSet, path::Path, time::Instant};

use bio::data_structures::interval_tree::ArrayBackedIntervalTree;
use mehari::common::open_read_maybe_gz;
use prost::Message;
use serde::Deserialize;
use tracing::info;

use crate::{
common::{Database, GenomeRelease, CHROMS},
strucvars::pbs,
};

/// Alias for the interval tree that we use.
type IntervalTree = ArrayBackedIntervalTree<i32, u32>;

/// Information to store for a TAD set entry.
#[derive(Default, Clone, Debug)]
pub struct GeneRegionRecord {
/// 0-based begin position.
pub begin: i32,
/// End position.
pub end: i32,
/// gene ID
pub gene_id: u32,
}

/// Gene region overlapping information.
#[derive(Default, Debug)]
pub struct GeneRegionDb {
/// Records, stored by chromosome.
pub records: Vec<Vec<GeneRegionRecord>>,
/// Interval trees, stored by chromosome.
pub trees: Vec<IntervalTree>,
}

impl GeneRegionDb {
/// Return IDs of overlapping genes.
pub fn overlapping_gene_ids(&self, chrom_no: u32, query: Range<i32>) -> Vec<u32> {
self.trees[chrom_no as usize]
.find(query)
.iter()
.map(|cursor| self.records[chrom_no as usize][*cursor.data() as usize].gene_id)
.collect()
}
}

#[tracing::instrument]
fn load_gene_regions_db(path: &Path) -> Result<GeneRegionDb, anyhow::Error> {
tracing::debug!("loading binary gene region records from {:?}...", path);

let before_loading = Instant::now();
let mut result = GeneRegionDb::default();
for _ in CHROMS {
result.records.push(Vec::new());
result.trees.push(IntervalTree::new());
}

let fcontents =
std::fs::read(path).map_err(|e| anyhow::anyhow!("error reading {:?}: {}", &path, e))?;
let gene_region_db = pbs::GeneRegionDatabase::decode(std::io::Cursor::new(fcontents))
.map_err(|e| anyhow::anyhow!("error decoding {:?}: {}", &path, e))?;

let mut total_count = 0;
for record in gene_region_db.records.into_iter() {
let chrom_no = record.chrom_no as usize;
let key = (record.start - 1)..record.stop;
result.trees[chrom_no].insert(key, result.records[chrom_no].len() as u32);
result.records[chrom_no].push(GeneRegionRecord {
begin: record.start - 1,
end: record.stop,
gene_id: record.gene_id,
});

total_count += 1;
}
result.trees.iter_mut().for_each(|tree| tree.index());
tracing::debug!(
"... done loading {} records and building trees in {:?}",
total_count,
before_loading.elapsed(),
);

Ok(result)
}
use crate::{common::GenomeRelease, strucvars::pbs};

/// Information to store for the interlink table.
#[derive(Default, Debug)]
Expand All @@ -108,43 +31,6 @@ pub struct XlinkDb {
pub from_hgnc: multimap::MultiMap<String, u32>,
}

impl XlinkDb {
fn entrez_id_to_symbols(&self, entrez_id: u32) -> Vec<String> {
let mut tmp = self
.from_entrez
.get_vec(&entrez_id)
.map_or(Vec::new(), |idxs| {
idxs.iter()
.map(|idx| self.records[*idx as usize].symbol.clone())
.collect::<Vec<String>>()
});
tmp.sort();
tmp.dedup();
tmp
}

fn ensembl_to_symbols(&self, ensembl_id: u32) -> Vec<String> {
let mut tmp = self
.from_ensembl
.get_vec(&ensembl_id)
.map_or(Vec::new(), |idxs| {
idxs.iter()
.map(|idx| self.records[*idx as usize].symbol.clone())
.collect::<Vec<String>>()
});
tmp.sort();
tmp.dedup();
tmp
}

pub fn gene_id_to_symbols(&self, database: Database, gene_id: u32) -> Vec<String> {
match database {
Database::RefSeq => self.entrez_id_to_symbols(gene_id),
Database::Ensembl => self.ensembl_to_symbols(gene_id),
}
}
}

#[tracing::instrument]
fn load_xlink_db(path: &Path) -> Result<XlinkDb, anyhow::Error> {
tracing::debug!("loading binary xlink records from {:?}...", path);
Expand Down Expand Up @@ -287,44 +173,17 @@ fn load_mim2gene_db(path: &Path) -> Result<OmimDb, anyhow::Error> {
/// Bundle of gene region DBs and the xlink info packaged with VarFish.
#[derive(Default, Debug)]
pub struct GeneDb {
pub refseq: GeneRegionDb,
pub ensembl: GeneRegionDb,
pub xlink: XlinkDb,
pub acmg: AcmgDb,
pub mim2gene: OmimDb,
}

impl GeneDb {
/// Return IDs of overlapping genes.
pub fn overlapping_gene_ids(
&self,
database: Database,
chrom_no: u32,
query: Range<i32>,
) -> Vec<u32> {
match database {
Database::RefSeq => self.refseq.overlapping_gene_ids(chrom_no, query),
Database::Ensembl => self.ensembl.overlapping_gene_ids(chrom_no, query),
}
}
}

// Load all gene information, such as region, id mapping and symbols.
#[tracing::instrument]
pub fn load_gene_db(path_db: &str, genome_release: GenomeRelease) -> Result<GeneDb, anyhow::Error> {
info!("Loading gene dbs");

let result = GeneDb {
refseq: load_gene_regions_db(
Path::new(path_db)
.join(format!("{}/genes/refseq_regions.bin", genome_release))
.as_path(),
)?,
ensembl: load_gene_regions_db(
Path::new(path_db)
.join(format!("{}/genes/ensembl_regions.bin", genome_release))
.as_path(),
)?,
xlink: load_xlink_db(Path::new(path_db).join("noref/genes/xlink.bin").as_path())?,
acmg: load_acmg_db(Path::new(path_db).join("noref/genes/acmg.tsv").as_path())?,
mim2gene: load_mim2gene_db(
Expand Down
Loading

0 comments on commit 28663b6

Please sign in to comment.