Skip to content

Commit

Permalink
feat: allow importing gnomAD SV/CNV v4 as background db (#295) (#310)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Feb 27, 2024
1 parent 43413d4 commit ce9b391
Show file tree
Hide file tree
Showing 20 changed files with 18,458 additions and 21 deletions.
72 changes: 64 additions & 8 deletions src/strucvars/txt_to_bin/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,12 @@ pub enum InputType {
StrucvarExacCnv,
/// Convert Thousand Genomes to binary.
StrucvarG1k,
/// Convert gnomAD SV to binary.
StrucvarGnomadSv,
/// Convert gnomAD SV v2 to binary.
StrucvarGnomadSv2,
/// Convert gnomAD CNV v4 to binary.
StrucvarGnomadCnv4,
/// Convert gnomAD SV v4 to binary.
StrucvarGnomadSv4,
/// Convert masked region to binary.
MaskedRegion,
/// Convert cross-link to binary.
Expand All @@ -100,7 +104,7 @@ pub struct Args {

/// Main entry point for the `strucvars txt-to-bin` command.
pub fn run(common_args: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> {
tracing::info!("Starting `db to-bin`");
tracing::info!("Starting `strucvars txt-to-bin`");
tracing::info!(" common_args = {:?}", &common_args);
tracing::info!(" args = {:?}", &args);

Expand Down Expand Up @@ -135,9 +139,21 @@ pub fn run(common_args: &crate::common::Args, args: &Args) -> Result<(), anyhow:
InputType::StrucvarG1k => {
vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::G1k)?
}
InputType::StrucvarGnomadSv => {
vardbs::convert_to_bin(&args.path_input, &args.path_output, InputFileType::Gnomad)?
}
InputType::StrucvarGnomadSv2 => vardbs::convert_to_bin(
&args.path_input,
&args.path_output,
InputFileType::GnomadSv2,
)?,
InputType::StrucvarGnomadCnv4 => vardbs::convert_to_bin(
&args.path_input,
&args.path_output,
InputFileType::GnomadCnv4,
)?,
InputType::StrucvarGnomadSv4 => vardbs::convert_to_bin(
&args.path_input,
&args.path_output,
InputFileType::GnomadSv4,
)?,
InputType::MaskedRegion => masked::convert_to_bin(&args.path_input, &args.path_output)?,
InputType::Xlink => xlink::convert_to_bin(&args.path_input, &args.path_output)?,
}
Expand Down Expand Up @@ -301,14 +317,14 @@ mod test {
}

#[test]
fn run_strucvar_gnomad_smoke() -> Result<(), anyhow::Error> {
fn run_strucvar_gnomad_sv2_smoke() -> Result<(), anyhow::Error> {
let tmp_dir = temp_testdir::TempDir::default();
let common_args = common::Args {
verbose: Verbosity::new(0, 0),
};
let args = Args {
assembly: None,
input_type: InputType::StrucvarGnomadSv,
input_type: InputType::StrucvarGnomadSv2,
path_input: String::from(
"tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/gnomad_sv.bed.gz",
),
Expand All @@ -320,6 +336,46 @@ mod test {
Ok(())
}

/// Smoke test: `run` succeeds on the gnomAD CNV v4 fixture (GRCh38 test data).
#[test]
fn run_strucvar_gnomad_cnv4_smoke() -> Result<(), anyhow::Error> {
    // The output is written below a temporary test directory.
    let scratch_dir = temp_testdir::TempDir::default();
    let path_output = scratch_dir.join("gnomad-cnv.bin");
    let path_input =
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch38/strucvar/gnomad-cnv.bed.gz"
            .to_string();

    let common_args = common::Args {
        verbose: Verbosity::new(0, 0),
    };
    let args = Args {
        assembly: None,
        input_type: InputType::StrucvarGnomadCnv4,
        path_input,
        path_output,
    };

    // Any conversion error is propagated and fails the test.
    super::run(&common_args, &args)
}

/// Smoke test: `run` succeeds on the gnomAD SV v4 fixture (GRCh38 test data).
#[test]
fn run_strucvar_gnomad_sv4_smoke() -> Result<(), anyhow::Error> {
    // The output is written below a temporary test directory.
    let scratch_dir = temp_testdir::TempDir::default();
    let path_output = scratch_dir.join("gnomad-sv.bin");
    let path_input =
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch38/strucvar/gnomad-sv.bed.gz"
            .to_string();

    let common_args = common::Args {
        verbose: Verbosity::new(0, 0),
    };
    let args = Args {
        assembly: None,
        input_type: InputType::StrucvarGnomadSv4,
        path_input,
        path_output,
    };

    // Any conversion error is propagated and fails the test.
    super::run(&common_args, &args)
}

#[test]
fn run_masked_region_smoke() -> Result<(), anyhow::Error> {
let tmp_dir = temp_testdir::TempDir::default();
Expand Down
109 changes: 106 additions & 3 deletions src/strucvars/txt_to_bin/vardbs/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,9 @@ pub struct G1kRecord {
pub n_het: u32,
}

/// gnomAD SV database record as read from TSV file.
/// gnomAD SV v2 database record as read from TSV file.
#[derive(Debug, Deserialize)]
pub struct GnomadRecord {
pub struct GnomadSv2Record {
/// chromosome name
pub chromosome: String,
/// begin position, 0-based
Expand All @@ -102,6 +102,56 @@ pub struct GnomadRecord {
pub n_het: u32,
}

/// gnomAD SV v4 database record as read from TSV file.
///
/// NOTE(review): the test reader in `vardbs/mod.rs` is built with
/// `has_headers(false)`, so fields are presumably matched to columns by
/// position; keep the field order in sync with the input file -- confirm
/// against the reader used by `convert_to_bin`.
#[derive(Debug, Deserialize)]
pub struct GnomadSv4Record {
/// chromosome name
pub chromosome: String,
/// begin position, 0-based
pub begin: i32,
/// end position, 0-based
pub end: i32,
/// The structural variant type
pub svtype: String,
/// Number of male homozygous reference allele carriers.
pub male_n_homref: u32,
/// Number of male heterozygous alternate allele carriers.
pub male_n_het: u32,
/// Number of male homozygous alternate allele carriers.
pub male_n_homalt: u32,
/// Number of male hemizygous reference allele carriers.
pub male_n_hemiref: u32,
/// Number of male hemizygous alternate allele carriers.
pub male_n_hemialt: u32,
/// Number of female homozygous reference allele carriers.
pub female_n_homref: u32,
/// Number of female heterozygous alternate allele carriers.
pub female_n_het: u32,
/// Number of female homozygous alternate allele carriers.
pub female_n_homalt: u32,
/// Number of samples at this site (CNV only).
pub cnv_n_total: u32,
/// Number of samples with a CNV at this site (CNV only).
pub cnv_n_var: u32,
}

/// gnomAD CNV v4 database record as read from TSV file.
///
/// NOTE(review): the test reader in `vardbs/mod.rs` is built with
/// `has_headers(false)`, so fields are presumably matched to columns by
/// position; keep the field order in sync with the input file -- confirm
/// against the reader used by `convert_to_bin`.
#[derive(Debug, Deserialize)]
pub struct GnomadCnv4Record {
/// chromosome name
pub chromosome: String,
/// begin position, 0-based
pub begin: i32,
/// end position, 0-based
pub end: i32,
/// The structural variant type
pub svtype: String,
/// Number of samples at this site (passing QC).
pub n_total: u32,
/// Number of samples with a CNV at this site (passing QC).
pub n_var: u32,
}

/// Common type to convert input data to.
pub struct InputRecord {
/// Chromosome of start position.
Expand Down Expand Up @@ -251,7 +301,7 @@ impl TryInto<Option<InputRecord>> for ExacRecord {
}
}

impl TryInto<Option<InputRecord>> for GnomadRecord {
impl TryInto<Option<InputRecord>> for GnomadSv2Record {
type Error = &'static str;

fn try_into(self) -> Result<Option<InputRecord>, Self::Error> {
Expand Down Expand Up @@ -279,6 +329,59 @@ impl TryInto<Option<InputRecord>> for GnomadRecord {
}
}

impl TryInto<Option<InputRecord>> for GnomadCnv4Record {
    type Error = &'static str;

    /// Convert a gnomAD CNV v4 record into the common `InputRecord`.
    ///
    /// Only `DEL` and `DUP` are accepted as SV type; any other value is
    /// logged and rejected with `Err("unknown SV type")`.  On success the
    /// result is always `Some(..)`, with both chromosomes set to the record's
    /// chromosome and `count` taken from `n_var`.
    fn try_into(self) -> Result<Option<InputRecord>, Self::Error> {
        // Resolve the SV type first so that unsupported values bail out early.
        let sv_type = match self.svtype.as_str() {
            "DEL" => SvType::Del,
            "DUP" => SvType::Dup,
            _ => {
                error!("sv_type = {}", &self.svtype);
                return Err("unknown SV type");
            }
        };

        let Self {
            chromosome,
            begin,
            end,
            n_var,
            ..
        } = self;

        Ok(Some(InputRecord {
            // Both ends lie on the same chromosome; clone before moving.
            chromosome2: chromosome.clone(),
            chromosome,
            begin,
            end,
            sv_type,
            count: n_var,
        }))
    }
}

impl TryInto<Option<InputRecord>> for GnomadSv4Record {
    type Error = &'static str;

    /// Convert a gnomAD SV v4 record into the common `InputRecord`.
    ///
    /// Accepted SV types are `BND`, `CNV`, `DEL`, `DUP`, `INS` and `INV`; any
    /// other value is logged and rejected with `Err("unknown SV type")`.  On
    /// success the result is always `Some(..)`, with both chromosomes set to
    /// the record's chromosome.  `count` is the sum of the male het / hom-alt /
    /// hemi-alt, the female het / hom-alt and the `cnv_n_var` columns.
    fn try_into(self) -> Result<Option<InputRecord>, Self::Error> {
        // Resolve the SV type first so that unsupported values bail out early.
        let sv_type = match self.svtype.as_str() {
            "BND" => SvType::Bnd,
            "CNV" => SvType::Cnv,
            "DEL" => SvType::Del,
            "DUP" => SvType::Dup,
            "INS" => SvType::Ins,
            "INV" => SvType::Inv,
            _ => {
                error!("sv_type = {}", &self.svtype);
                return Err("unknown SV type");
            }
        };

        // Total number of carriers over all alternate-allele columns.
        let count = [
            self.male_n_het,
            self.male_n_homalt,
            self.male_n_hemialt,
            self.female_n_het,
            self.female_n_homalt,
            self.cnv_n_var,
        ]
        .iter()
        .sum::<u32>();

        let Self {
            chromosome,
            begin,
            end,
            ..
        } = self;

        Ok(Some(InputRecord {
            // Both ends are reported on the same chromosome; clone before moving.
            chromosome2: chromosome.clone(),
            chromosome,
            begin,
            end,
            sv_type,
            count,
        }))
    }
}

impl TryInto<Option<InputRecord>> for G1kRecord {
type Error = &'static str;

Expand Down
102 changes: 92 additions & 10 deletions src/strucvars/txt_to_bin/vardbs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ pub enum InputFileType {
DgvGs,
Exac,
G1k,
Gnomad,
GnomadSv2,
GnomadCnv4,
GnomadSv4,
InhouseDb,
}
/// Deserialize from CSV reader to an `Option<records::InputRecord>`
Expand Down Expand Up @@ -76,6 +78,24 @@ where
Ok(result)
}

/// Branch around `deserialize_loop`.
pub fn deserialize_branch(
input_type: InputFileType,
reader: &mut csv::Reader<Box<dyn std::io::BufRead>>,
) -> Result<Vec<BgDbRecord>, anyhow::Error> {
match input_type {
InputFileType::Dbvar => deserialize_loop::<input::DbVarRecord>(reader),
InputFileType::Dgv => deserialize_loop::<input::DgvRecord>(reader),
InputFileType::DgvGs => deserialize_loop::<input::DgvGsRecord>(reader),
InputFileType::Exac => deserialize_loop::<input::ExacRecord>(reader),
InputFileType::G1k => deserialize_loop::<input::G1kRecord>(reader),
InputFileType::InhouseDb => deserialize_loop::<InhouseDbRecord>(reader),
InputFileType::GnomadSv2 => deserialize_loop::<input::GnomadSv2Record>(reader),
InputFileType::GnomadCnv4 => deserialize_loop::<input::GnomadCnv4Record>(reader),
InputFileType::GnomadSv4 => deserialize_loop::<input::GnomadSv4Record>(reader),
}
}

/// Perform conversion to protobuf `.bin` file.
pub fn convert_to_bin<P, Q>(
path_input_tsv: P,
Expand All @@ -97,15 +117,7 @@ where
)?);
let before_parsing = Instant::now();

let records = match input_type {
InputFileType::Dbvar => deserialize_loop::<input::DbVarRecord>(&mut reader)?,
InputFileType::Dgv => deserialize_loop::<input::DgvRecord>(&mut reader)?,
InputFileType::DgvGs => deserialize_loop::<input::DgvGsRecord>(&mut reader)?,
InputFileType::Exac => deserialize_loop::<input::ExacRecord>(&mut reader)?,
InputFileType::G1k => deserialize_loop::<input::G1kRecord>(&mut reader)?,
InputFileType::Gnomad => deserialize_loop::<input::GnomadRecord>(&mut reader)?,
InputFileType::InhouseDb => deserialize_loop::<InhouseDbRecord>(&mut reader)?,
};
let records = deserialize_branch(input_type, &mut reader)?;
let bg_db = BackgroundDatabase { records };

tracing::debug!(
Expand All @@ -127,3 +139,73 @@ where

Ok(())
}

#[cfg(test)]
mod test {
    use super::InputFileType;

    // Snapshot test for `deserialize_branch`, with one case per input file type.
    //
    // Each case reads a tab-separated fixture (headerless, `#` starts a comment
    // line) and compares the deserialized records against a YAML snapshot.
    #[rstest::rstest]
    #[case::dbvar(
        InputFileType::Dbvar,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dbvar.bed.gz"
    )]
    #[case::dgv(
        InputFileType::Dgv,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dgv.bed.gz"
    )]
    #[case::dgv_gs(
        InputFileType::DgvGs,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/dgv_gs.bed.gz"
    )]
    #[case::exac(
        InputFileType::Exac,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/exac.bed.gz"
    )]
    #[case::g1k(
        InputFileType::G1k,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/g1k.bed.gz"
    )]
    #[case::gnomad_sv2(
        InputFileType::GnomadSv2,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/gnomad_sv.bed.gz"
    )]
    #[case::gnomad_cnv4(
        InputFileType::GnomadCnv4,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch38/strucvar/gnomad-cnv.bed.gz"
    )]
    #[case::gnomad_sv4(
        InputFileType::GnomadSv4,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch38/strucvar/gnomad-sv.bed.gz"
    )]
    #[case::inhouse_db(
        InputFileType::InhouseDb,
        "tests/db/to-bin/varfish-db-downloader/vardbs/grch37/strucvar/inhouse.tsv"
    )]
    fn test_deserialize_branch(
        #[case] input_type: InputFileType,
        #[case] path_input: &str,
    ) -> Result<(), anyhow::Error> {
        // Snapshot suffix is `<InputFileType>-<stem>`, where the stem is the
        // file name up to its first `.` (e.g. `gnomad-sv` for `gnomad-sv.bed.gz`).
        let file_name = path_input.rsplit('/').next().unwrap();
        let stem = file_name.split('.').next().unwrap();
        mehari::common::set_snapshot_suffix!("{:?}-{}", input_type, stem);

        let input = mehari::common::io::std::open_read_maybe_gz(path_input)?;
        let mut reader = csv::ReaderBuilder::new()
            .has_headers(false)
            .comment(Some(b'#'))
            .delimiter(b'\t')
            .from_reader(input);

        let records = super::deserialize_branch(input_type, &mut reader)?;
        insta::assert_yaml_snapshot!(records);

        Ok(())
    }
}
Loading

0 comments on commit ce9b391

Please sign in to comment.