Skip to content

Commit

Permalink
feat: allow importing gnomAD SV/CNV v4 as background db (#295)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Feb 27, 2024
1 parent 43413d4 commit 61ab476
Show file tree
Hide file tree
Showing 10 changed files with 3,748 additions and 16 deletions.
8 changes: 5 additions & 3 deletions src/strucvars/txt_to_bin/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,11 @@ pub fn run(common_args: &crate::common::Args, args: &Args) -> Result<(), anyhow:
InputType::StrucvarG1k => {
vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::G1k)?
}
InputType::StrucvarGnomadSv => {
vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::Gnomad)?
}
InputType::StrucvarGnomadSv => vardbs::convert_to_bin(
&args.path_input,
&args.path_output,
InputFileType::GnomadSv2,
)?,
InputType::MaskedRegion => masked::convert_to_bin(&args.path_input, &args.path_output)?,
InputType::Xlink => xlink::convert_to_bin(&args.path_input, &args.path_output)?,
}
Expand Down
56 changes: 53 additions & 3 deletions src/strucvars/txt_to_bin/vardbs/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,43 @@ pub struct G1kRecord {
pub n_het: u32,
}

/// gnomAD SV database record as read from TSV file.
/// gnomAD SV v2 database record as read from TSV file.
#[derive(Debug, Deserialize)]
pub struct GnomadRecord {
pub struct GnomadSv2Record {
/// chromosome name
pub chromosome: String,
/// begin position, 0-based
pub begin: i32,
/// end position, 0-based
pub end: i32,
/// The structural variant type
pub svtype: String,
/// Number of homozygous alternative carriers
pub n_homalt: u32,
/// Number of heterozygous carriers
pub n_het: u32,
}

/// gnomAD SV v4 database record as read from TSV file.
///
/// NOTE(review): the fields mirror `GnomadSv2Record` one-to-one -- confirm that this matches
/// the column layout of the gnomAD SV v4 TSV files before relying on it.
#[derive(Debug, Deserialize)]
pub struct GnomadSv4Record {
/// chromosome name
pub chromosome: String,
/// begin position, 0-based
pub begin: i32,
/// end position, 0-based
pub end: i32,
/// The structural variant type
pub svtype: String,
/// Number of homozygous alternative carriers
pub n_homalt: u32,
/// Number of heterozygous carriers
pub n_het: u32,
}

/// gnomAD CNV v4 database record as read from TSV file.
#[derive(Debug, Deserialize)]
pub struct GnomadCnv4Record {
/// chromosome name
pub chromosome: String,
/// begin position, 0-based
Expand Down Expand Up @@ -251,7 +285,7 @@ impl TryInto<Option<InputRecord>> for ExacRecord {
}
}

impl TryInto<Option<InputRecord>> for GnomadRecord {
impl TryInto<Option<InputRecord>> for GnomadSv2Record {
type Error = &'static str;

fn try_into(self) -> Result<Option<InputRecord>, Self::Error> {
Expand Down Expand Up @@ -279,6 +313,22 @@ impl TryInto<Option<InputRecord>> for GnomadRecord {
}
}

impl TryInto<Option<InputRecord>> for GnomadCnv4Record {
    type Error = &'static str;

    /// Convert a gnomAD CNV v4 record into an `InputRecord`.
    ///
    /// The conversion itself has not been written yet.
    ///
    /// # Errors
    ///
    /// Always returns an error, so that selecting the gnomAD CNV v4 input type surfaces as a
    /// regular conversion failure instead of aborting the process with a panic.
    fn try_into(self) -> Result<Option<InputRecord>, Self::Error> {
        // TODO: implement the actual mapping of the CNV v4 columns to `InputRecord`.
        Err("conversion of gnomAD CNV v4 records is not implemented yet")
    }
}

impl TryInto<Option<InputRecord>> for GnomadSv4Record {
    type Error = &'static str;

    /// Convert a gnomAD SV v4 record into an `InputRecord`.
    ///
    /// The conversion itself has not been written yet.
    ///
    /// # Errors
    ///
    /// Always returns an error, so that selecting the gnomAD SV v4 input type surfaces as a
    /// regular conversion failure instead of aborting the process with a panic.
    fn try_into(self) -> Result<Option<InputRecord>, Self::Error> {
        // TODO: implement the actual mapping of the SV v4 columns to `InputRecord`.
        Err("conversion of gnomAD SV v4 records is not implemented yet")
    }
}

impl TryInto<Option<InputRecord>> for G1kRecord {
type Error = &'static str;

Expand Down
102 changes: 92 additions & 10 deletions src/strucvars/txt_to_bin/vardbs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ pub enum InputFileType {
DgvGs,
Exac,
G1k,
Gnomad,
GnomadSv2,
GnomadCnv4,
GnomadSv4,
InhouseDb,
}
/// Deserialize from CSV reader to an `Option<records::InputRecord>`
Expand Down Expand Up @@ -76,6 +78,24 @@ where
Ok(result)
}

/// Branch around `deserialize_loop`.
pub fn deserialize_branch(
input_type: InputFileType,
reader: &mut csv::Reader<Box<dyn std::io::BufRead>>,
) -> Result<Vec<BgDbRecord>, anyhow::Error> {
match input_type {
InputFileType::Dbvar => deserialize_loop::<input::DbVarRecord>(reader),
InputFileType::Dgv => deserialize_loop::<input::DgvRecord>(reader),
InputFileType::DgvGs => deserialize_loop::<input::DgvGsRecord>(reader),
InputFileType::Exac => deserialize_loop::<input::ExacRecord>(reader),
InputFileType::G1k => deserialize_loop::<input::G1kRecord>(reader),
InputFileType::InhouseDb => deserialize_loop::<InhouseDbRecord>(reader),
InputFileType::GnomadSv2 => deserialize_loop::<input::GnomadSv2Record>(reader),
InputFileType::GnomadCnv4 => deserialize_loop::<input::GnomadCnv4Record>(reader),
InputFileType::GnomadSv4 => deserialize_loop::<input::GnomadSv4Record>(reader),
}
}

/// Perform conversion to protobuf `.bin` file.
pub fn convert_to_bin<P, Q>(
path_input_tsv: P,
Expand All @@ -97,15 +117,7 @@ where
)?);
let before_parsing = Instant::now();

let records = match input_type {
InputFileType::Dbvar => deserialize_loop::<input::DbVarRecord>(&mut reader)?,
InputFileType::Dgv => deserialize_loop::<input::DgvRecord>(&mut reader)?,
InputFileType::DgvGs => deserialize_loop::<input::DgvGsRecord>(&mut reader)?,
InputFileType::Exac => deserialize_loop::<input::ExacRecord>(&mut reader)?,
InputFileType::G1k => deserialize_loop::<input::G1kRecord>(&mut reader)?,
InputFileType::Gnomad => deserialize_loop::<input::GnomadRecord>(&mut reader)?,
InputFileType::InhouseDb => deserialize_loop::<InhouseDbRecord>(&mut reader)?,
};
let records = deserialize_branch(input_type, &mut reader)?;
let bg_db = BackgroundDatabase { records };

tracing::debug!(
Expand All @@ -127,3 +139,73 @@ where

Ok(())
}

#[cfg(test)]
mod test {
    use super::InputFileType;

    /// Snapshot test for `deserialize_branch`, with one fixture file per input type.
    #[rstest::rstest]
    #[case(
        InputFileType::Dbvar,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dbvar.bed.gz"
    )]
    #[case(
        InputFileType::Dgv,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dgv.bed.gz"
    )]
    #[case(
        InputFileType::DgvGs,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dgv_gs.bed.gz"
    )]
    #[case(
        InputFileType::Exac,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/exac.bed.gz"
    )]
    #[case(
        InputFileType::G1k,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/g1k.bed.gz"
    )]
    #[case(
        InputFileType::GnomadSv2,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/gnomad_sv.bed.gz"
    )]
    // NOTE(review): the gnomAD v4 cases below are disabled -- presumably until the
    // conversions for these record types are implemented; confirm and re-enable.
    // #[case(
    //     InputFileType::GnomadCnv4,
    //     "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/gnomad_cnv4.bed.gz",
    // )]
    // #[case(
    //     InputFileType::GnomadSv4,
    //     "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/gnomad_sv4.bed.gz",
    // )]
    #[case(
        InputFileType::InhouseDb,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/inhouse.tsv"
    )]
    fn test_deserialize_branch(
        #[case] input_type: InputFileType,
        #[case] path_input: &str,
    ) -> Result<(), anyhow::Error> {
        // The snapshot suffix is the input type plus the fixture's file name up to its
        // first `.` (e.g. `dgv_gs` for `.../dgv_gs.bed.gz`).
        let file_name = path_input.rsplit('/').next().unwrap();
        let stem = file_name.split('.').next().unwrap();
        mehari::common::set_snapshot_suffix!("{:?}-{}", input_type, stem);

        // Tab-separated input without a header row; lines starting with `#` are comments.
        let input = mehari::common::io::std::open_read_maybe_gz(path_input)?;
        let mut reader = csv::ReaderBuilder::new()
            .delimiter(b'\t')
            .comment(Some(b'#'))
            .has_headers(false)
            .from_reader(input);

        let records = super::deserialize_branch(input_type, &mut reader)?;
        insta::assert_yaml_snapshot!(records);

        Ok(())
    }
}
Loading

0 comments on commit 61ab476

Please sign in to comment.