Skip to content

Commit

Permalink
feat: implement ingest for Clair3+GLNexus (#297) (#311)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Feb 27, 2024
1 parent ce9b391 commit 4144003
Show file tree
Hide file tree
Showing 8 changed files with 330 additions and 13 deletions.
76 changes: 70 additions & 6 deletions src/seqvars/ingest/header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,19 @@ use crate::common::GenomeRelease;
/// Enumeration for the known variant callers.
#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub enum VariantCaller {
GatkHaplotypeCaller { version: String },
GatkUnifiedGenotyper { version: String },
Dragen { version: String },
GatkHaplotypeCaller {
version: String,
},
GatkUnifiedGenotyper {
version: String,
},
Glnexus {
version: String,
config_name: Option<String>,
},
Dragen {
version: String,
},
Other,
}

Expand All @@ -20,19 +30,24 @@ impl VariantCaller {
VariantCaller::GatkHaplotypeCaller { .. } => "GatkHaplotypeCaller",
VariantCaller::GatkUnifiedGenotyper { .. } => "GatkUnifiedGenotyper",
VariantCaller::Dragen { .. } => "Dragen",
VariantCaller::Glnexus { .. } => "Glnexus",
VariantCaller::Other => "Other",
}
}
}

impl VariantCaller {
pub fn guess(header: &vcf::Header) -> Option<Self> {
use vcf::header::record::value::collection::Collection;

let mut glnexus_version: Option<String> = None;
let mut glnexus_config_name: Option<String> = None;

for (other, collection) in header.other_records() {
if ["GATKCommandLine", "DRAGENCommandLine"]
.iter()
.any(|k| other.as_ref().starts_with(k))
{
use vcf::header::record::value::collection::Collection;
if let Collection::Structured(map) = collection {
for (key, values) in map.iter() {
match (key.as_str(), values.other_fields().get("Version").cloned()) {
Expand All @@ -49,8 +64,24 @@ impl VariantCaller {
}
}
}
} else if other.as_ref().starts_with("GLnexusVersion") {
if let Collection::Unstructured(values) = collection {
glnexus_version = Some(values[0].clone());
}
} else if other.as_ref().starts_with("GLnexusConfigName") {
if let Collection::Unstructured(values) = collection {
glnexus_config_name = Some(values[0].clone());
}
}
}

if let Some(version) = glnexus_version {
return Some(VariantCaller::Glnexus {
version,
config_name: glnexus_config_name,
});
}

None
}
}
Expand Down Expand Up @@ -415,6 +446,23 @@ pub fn build_output_header(
.build()?,
),
)?,
VariantCaller::Glnexus {
version,
config_name,
} => builder.insert(
"x-varfish-version".parse()?,
vcf::header::record::Value::Map(
String::from("orig-caller"),
Map::<Other>::builder()
.insert("Name".parse()?, orig_caller.name())
.insert("Version".parse()?, version)
.insert(
"ConfigName".parse()?,
config_name.clone().unwrap_or_default(),
)
.build()?,
),
)?,
VariantCaller::Other => builder.insert(
"x-varfish-version".parse()?,
vcf::header::record::Value::Map(
Expand All @@ -432,11 +480,13 @@ pub fn build_output_header(
#[cfg(test)]
mod test {
use mehari::ped::PedigreeByName;
use noodles_vcf as vcf;
use rstest::rstest;

use super::VariantCaller;

#[rstest]
#[case("tests/seqvars/ingest/clair3_glnexus.vcf")]
#[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")]
#[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")]
#[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
Expand All @@ -454,6 +504,7 @@ mod test {
}

#[rstest]
#[case("tests/seqvars/ingest/clair3_glnexus.vcf")]
#[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")]
#[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")]
#[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
Expand All @@ -464,7 +515,7 @@ mod test {

let pedigree = PedigreeByName::from_path(path.replace(".vcf", ".ped")).unwrap();

let input_vcf_header = noodles_vcf::reader::Builder::default()
let mut input_vcf_header = noodles_vcf::reader::Builder::default()
.build_from_path(path)?
.read_header()?;
let output_vcf_header = super::build_output_header(
Expand All @@ -476,6 +527,12 @@ mod test {
"x.y.z",
)?;

// Work around glnexus issue with RNC.
if let Some(format) = input_vcf_header.formats_mut().get_mut("RNC") {
*format.number_mut() = vcf::header::Number::Count(1);
*format.type_mut() = vcf::header::record::value::map::format::Type::String;
}

let out_path = tmpdir.join("out.vcf");
let out_path_str = out_path.to_str().expect("invalid path");
{
Expand All @@ -489,6 +546,7 @@ mod test {
}

#[rstest]
#[case("tests/seqvars/ingest/clair3_glnexus.vcf")]
#[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")]
#[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")]
#[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
Expand All @@ -499,7 +557,7 @@ mod test {

let pedigree = PedigreeByName::from_path(path.replace(".vcf", ".ped")).unwrap();

let input_vcf_header = noodles_vcf::reader::Builder::default()
let mut input_vcf_header = noodles_vcf::reader::Builder::default()
.build_from_path(path)?
.read_header()?;
let output_vcf_header = super::build_output_header(
Expand All @@ -511,6 +569,12 @@ mod test {
"x.y.z",
)?;

// Work around glnexus issue with RNC.
if let Some(format) = input_vcf_header.formats_mut().get_mut("RNC") {
*format.number_mut() = vcf::header::Number::Count(1);
*format.type_mut() = vcf::header::record::value::map::format::Type::String;
}

let out_path = tmpdir.join("out.vcf");
let out_path_str = out_path.to_str().expect("invalid path");
{
Expand Down
21 changes: 14 additions & 7 deletions src/seqvars/ingest/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a
.map_err(|e| anyhow::anyhow!("could not build VCF reader: {}", e))?;

tracing::info!("processing header...");
let input_header = input_reader
let mut input_header = input_reader
.read_header()
.await
.map_err(|e| anyhow::anyhow!("problem reading VCF header: {}", e))?;
Expand All @@ -526,6 +526,12 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a
)
.map_err(|e| anyhow::anyhow!("problem building output header: {}", e))?;

// Work around glnexus issue with RNC.
if let Some(format) = input_header.formats_mut().get_mut("RNC") {
*format.number_mut() = vcf::header::Number::Count(1);
*format.type_mut() = vcf::header::record::value::map::format::Type::String;
}

// Use output file helper.
let out_path_helper = crate::common::s3::OutputPathHelper::new(&args.path_out)?;

Expand Down Expand Up @@ -566,12 +572,13 @@ mod test {
use crate::common::GenomeRelease;

#[rstest]
#[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")]
#[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")]
#[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
#[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")]
#[case("tests/seqvars/ingest/NA12878_dragen.vcf")]
#[case("tests/seqvars/ingest/Case_1.vcf")]
#[case::clair3_glnexus("tests/seqvars/ingest/clair3_glnexus.vcf")]
#[case::dragen_07_021_624_3_10_4("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")]
#[case::dragen_07_021_624_3_10_9("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")]
#[case::gatk_hc_3_7("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
#[case::gatk_hc_4_4_0_0("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")]
#[case::dragen_na12787("tests/seqvars/ingest/NA12878_dragen.vcf")]
#[case::gatk_hc_case_1("tests/seqvars/ingest/Case_1.vcf")]
#[tokio::test]
async fn result_snapshot_test(#[case] path: &str) -> Result<(), anyhow::Error> {
mehari::common::set_snapshot_suffix!(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
---
source: src/seqvars/ingest/header.rs
expression: "std::fs::read_to_string(out_path_str)?"
---
##fileformat=VCFv4.4
##INFO=<ID=gnomad_exomes_an,Number=1,Type=Integer,Description="Number of alleles in gnomAD exomes">
##INFO=<ID=gnomad_exomes_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_exomes_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_exomes_hemi,Number=1,Type=Integer,Description="Number of hemi. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_genomes_an,Number=1,Type=Integer,Description="Number of alleles in gnomAD genomes">
##INFO=<ID=gnomad_genomes_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in gnomAD genomes">
##INFO=<ID=gnomad_genomes_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in gnomAD genomes">
##INFO=<ID=gnomad_genomes_hemi,Number=1,Type=Integer,Description="Number of hemi. alt. carriers in gnomAD genomes">
##INFO=<ID=helix_an,Number=1,Type=Integer,Description="Number of alleles in HelixMtDb">
##INFO=<ID=helix_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in HelixMtDb">
##INFO=<ID=helix_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in HelixMtDb">
##INFO=<ID=ANN,Number=.,Type=String,Description="Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
##contig=<ID=1,length=249250621,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=2,length=243199373,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=3,length=198022430,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=4,length=191154276,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=5,length=180915260,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=6,length=171115067,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=7,length=159138663,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=8,length=146364022,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=9,length=141213431,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=10,length=135534747,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=11,length=135006516,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=12,length=133851895,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=13,length=115169878,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=14,length=107349540,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=15,length=102531392,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=16,length=90354753,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=17,length=81195210,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=18,length=78077248,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=19,length=59128983,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=20,length=63025520,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=21,length=48129895,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=22,length=51304566,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=X,length=155270560,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=Y,length=59373566,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=MT,length=16569,assembly="GRCh37",species="Homo sapiens">
##fileDate=20230421
##SAMPLE=<ID=SAMPLE1,Sex="Male",Disease="Affected">
##SAMPLE=<ID=SAMPLE2,Sex="Female",Disease="Affected">
##PEDIGREE=<ID=SAMPLE1>
##PEDIGREE=<ID=SAMPLE2>
##x-varfish-case-uuid=00000000-0000-0000-0000-000000000000
##x-varfish-version=<ID=varfish-server-worker,Version="x.y.z">
##x-varfish-version=<ID=orig-caller,Name="Glnexus",Version="v1.4.1-0-g68e25e5",ConfigName="/tmp/clair3.yml">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 SAMPLE2
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
---
source: src/seqvars/ingest/header.rs
expression: "std::fs::read_to_string(out_path_str)?"
---
##fileformat=VCFv4.4
##INFO=<ID=gnomad_exomes_an,Number=1,Type=Integer,Description="Number of alleles in gnomAD exomes">
##INFO=<ID=gnomad_exomes_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_exomes_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_exomes_hemi,Number=1,Type=Integer,Description="Number of hemi. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_genomes_an,Number=1,Type=Integer,Description="Number of alleles in gnomAD genomes">
##INFO=<ID=gnomad_genomes_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in gnomAD genomes">
##INFO=<ID=gnomad_genomes_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in gnomAD genomes">
##INFO=<ID=gnomad_genomes_hemi,Number=1,Type=Integer,Description="Number of hemi. alt. carriers in gnomAD genomes">
##INFO=<ID=helix_an,Number=1,Type=Integer,Description="Number of alleles in HelixMtDb">
##INFO=<ID=helix_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in HelixMtDb">
##INFO=<ID=helix_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in HelixMtDb">
##INFO=<ID=ANN,Number=.,Type=String,Description="Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
##contig=<ID=chr1,length=248956422,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr2,length=242193529,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr3,length=198295559,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr4,length=190214555,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr5,length=181538259,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr6,length=170805979,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr7,length=159345973,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr8,length=145138636,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr9,length=138394717,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr10,length=133797422,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr11,length=135086622,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr12,length=133275309,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr13,length=114364328,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr14,length=107043718,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr15,length=101991189,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr16,length=90338345,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr17,length=83257441,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr18,length=80373285,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr19,length=58617616,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr20,length=64444167,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr21,length=46709983,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr22,length=50818468,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chrX,length=156040895,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chrY,length=57227415,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chrM,length=16569,assembly="GRCh38",species="Homo sapiens">
##fileDate=20230421
##SAMPLE=<ID=SAMPLE1,Sex="Male",Disease="Affected">
##SAMPLE=<ID=SAMPLE2,Sex="Female",Disease="Affected">
##PEDIGREE=<ID=SAMPLE1>
##PEDIGREE=<ID=SAMPLE2>
##x-varfish-case-uuid=00000000-0000-0000-0000-000000000000
##x-varfish-version=<ID=varfish-server-worker,Version="x.y.z">
##x-varfish-version=<ID=orig-caller,Name="Glnexus",Version="v1.4.1-0-g68e25e5",ConfigName="/tmp/clair3.yml">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 SAMPLE2
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
source: src/seqvars/ingest/header.rs
expression: "VariantCaller::guess(&vcf_header)"
---
Glnexus:
version: v1.4.1-0-g68e25e5
config_name: /tmp/clair3.yml
Loading

0 comments on commit 4144003

Please sign in to comment.