Skip to content

Commit

Permalink
feat: implement ingest for Clair3+GLNexus (#297)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Feb 27, 2024
1 parent 43413d4 commit cdb6064
Show file tree
Hide file tree
Showing 5 changed files with 191 additions and 11 deletions.
56 changes: 52 additions & 4 deletions src/seqvars/ingest/header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,19 @@ use crate::common::GenomeRelease;
/// Enumeration for the known variant callers.
#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub enum VariantCaller {
GatkHaplotypeCaller { version: String },
GatkUnifiedGenotyper { version: String },
Dragen { version: String },
GatkHaplotypeCaller {
version: String,
},
GatkUnifiedGenotyper {
version: String,
},
Glnexus {
version: String,
config_name: Option<String>,
},
Dragen {
version: String,
},
Other,
}

Expand All @@ -20,19 +30,24 @@ impl VariantCaller {
VariantCaller::GatkHaplotypeCaller { .. } => "GatkHaplotypeCaller",
VariantCaller::GatkUnifiedGenotyper { .. } => "GatkUnifiedGenotyper",
VariantCaller::Dragen { .. } => "Dragen",
VariantCaller::Glnexus { .. } => "Glnexus",
VariantCaller::Other => "Other",
}
}
}

impl VariantCaller {
pub fn guess(header: &vcf::Header) -> Option<Self> {
use vcf::header::record::value::collection::Collection;

let mut glnexus_version: Option<String> = None;
let mut glnexus_config_name: Option<String> = None;

for (other, collection) in header.other_records() {
if ["GATKCommandLine", "DRAGENCommandLine"]
.iter()
.any(|k| other.as_ref().starts_with(k))
{
use vcf::header::record::value::collection::Collection;
if let Collection::Structured(map) = collection {
for (key, values) in map.iter() {
match (key.as_str(), values.other_fields().get("Version").cloned()) {
Expand All @@ -49,8 +64,24 @@ impl VariantCaller {
}
}
}
} else if other.as_ref().starts_with("GLnexusVersion") {
if let Collection::Unstructured(values) = collection {
glnexus_version = Some(values[0].clone());
}
} else if other.as_ref().starts_with("GLnexusConfigName") {
if let Collection::Unstructured(values) = collection {
glnexus_config_name = Some(values[0].clone());
}
}
}

if let Some(version) = glnexus_version {
return Some(VariantCaller::Glnexus {
version,
config_name: glnexus_config_name,
});
}

None
}
}
Expand Down Expand Up @@ -415,6 +446,23 @@ pub fn build_output_header(
.build()?,
),
)?,
VariantCaller::Glnexus {
version,
config_name,
} => builder.insert(
"x-varfish-version".parse()?,
vcf::header::record::Value::Map(
String::from("orig-caller"),
Map::<Other>::builder()
.insert("Name".parse()?, orig_caller.name())
.insert("Version".parse()?, version)
.insert(
"ConfigName".parse()?,
config_name.clone().unwrap_or_default(),
)
.build()?,
),
)?,
VariantCaller::Other => builder.insert(
"x-varfish-version".parse()?,
vcf::header::record::Value::Map(
Expand Down
21 changes: 14 additions & 7 deletions src/seqvars/ingest/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a
.map_err(|e| anyhow::anyhow!("could not build VCF reader: {}", e))?;

tracing::info!("processing header...");
let input_header = input_reader
let mut input_header = input_reader
.read_header()
.await
.map_err(|e| anyhow::anyhow!("problem reading VCF header: {}", e))?;
Expand All @@ -526,6 +526,12 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a
)
.map_err(|e| anyhow::anyhow!("problem building output header: {}", e))?;

// Work around glnexus issue with RNC.
if let Some(format) = input_header.formats_mut().get_mut("RNC") {
*format.number_mut() = vcf::header::Number::Count(1);
*format.type_mut() = vcf::header::record::value::map::format::Type::String;
}

// Use output file helper.
let out_path_helper = crate::common::s3::OutputPathHelper::new(&args.path_out)?;

Expand Down Expand Up @@ -566,12 +572,13 @@ mod test {
use crate::common::GenomeRelease;

#[rstest]
#[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")]
#[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")]
#[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
#[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")]
#[case("tests/seqvars/ingest/NA12878_dragen.vcf")]
#[case("tests/seqvars/ingest/Case_1.vcf")]
#[case::clair3_glnexus("tests/seqvars/ingest/clair3_glnexus.vcf")]
#[case::dragen_07_021_624_3_10_4("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")]
#[case::dragen_07_021_624_3_10_9("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")]
#[case::gatk_hc_3_7("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
#[case::gatk_hc_4_4_0_0("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")]
#[case::dragen_na12787("tests/seqvars/ingest/NA12878_dragen.vcf")]
#[case::gatk_hc_case_1("tests/seqvars/ingest/Case_1.vcf")]
#[tokio::test]
async fn result_snapshot_test(#[case] path: &str) -> Result<(), anyhow::Error> {
mehari::common::set_snapshot_suffix!(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
---
source: src/seqvars/ingest/mod.rs
expression: "std::fs::read_to_string(&args.path_out)?"
---
##fileformat=VCFv4.4
##INFO=<ID=gnomad_exomes_an,Number=1,Type=Integer,Description="Number of alleles in gnomAD exomes">
##INFO=<ID=gnomad_exomes_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_exomes_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_exomes_hemi,Number=1,Type=Integer,Description="Number of hemi. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_genomes_an,Number=1,Type=Integer,Description="Number of alleles in gnomAD genomes">
##INFO=<ID=gnomad_genomes_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in gnomAD genomes">
##INFO=<ID=gnomad_genomes_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in gnomAD genomes">
##INFO=<ID=gnomad_genomes_hemi,Number=1,Type=Integer,Description="Number of hemi. alt. carriers in gnomAD genomes">
##INFO=<ID=helix_an,Number=1,Type=Integer,Description="Number of alleles in HelixMtDb">
##INFO=<ID=helix_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in HelixMtDb">
##INFO=<ID=helix_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in HelixMtDb">
##INFO=<ID=ANN,Number=.,Type=String,Description="Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
##contig=<ID=1,length=249250621,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=2,length=243199373,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=3,length=198022430,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=4,length=191154276,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=5,length=180915260,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=6,length=171115067,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=7,length=159138663,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=8,length=146364022,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=9,length=141213431,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=10,length=135534747,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=11,length=135006516,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=12,length=133851895,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=13,length=115169878,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=14,length=107349540,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=15,length=102531392,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=16,length=90354753,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=17,length=81195210,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=18,length=78077248,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=19,length=59128983,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=20,length=63025520,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=21,length=48129895,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=22,length=51304566,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=X,length=155270560,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=Y,length=59373566,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=MT,length=16569,assembly="GRCh37",species="Homo sapiens">
##fileDate=20230421
##SAMPLE=<ID=SAMPLE1,Sex="Male",Disease="Affected">
##SAMPLE=<ID=SAMPLE2,Sex="Female",Disease="Affected">
##PEDIGREE=<ID=SAMPLE1>
##PEDIGREE=<ID=SAMPLE2>
##x-varfish-case-uuid=00000000-0000-0000-0000-000000000000
##x-varfish-version=<ID=varfish-server-worker,Version="x.y.z">
##x-varfish-version=<ID=orig-caller,Name="Glnexus",Version="v1.4.1-0-g68e25e5",ConfigName="/tmp/clair3.yml">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 SAMPLE2
1 10108 . C CT . . . GT:DP:AD:GQ ./.:17:17,0:0 1/1:15:1,14:1
1 10109 . A T . . . GT:DP:AD:GQ 0/1:11:8,3:9 0/0:15:15,0:44
1 10147 . C CCCCTAA . . . GT:DP:AD:GQ 0/1:11:8,3:5 0/1:15:4,11:4
1 10206 . A AC . . . GT:DP:AD:GQ ./.:15:15,0:0 0/1:15:4,11:2
1 10212 . A AC . . . GT:DP:AD:GQ ./.:15:15,0:0 0/1:15:4,11:1
1 10218 . A AC . . . GT:DP:AD:GQ ./.:13:13,0:0 0/1:15:4,11:0
1 10230 . AC ACCCTAA . . . GT:DP:AD:GQ 1/0:11:8,3:5 0/1:15:4,11:2
1 10230 . AC A . . . GT:DP:AD:GQ 0/1:11:5,6:5 0/0:15:15,0:2
1 10254 . T C . . . GT:DP:AD:GQ 0/1:11:9,2:5 1/1:15:4,11:2
1 10257 . A C . . . GT:DP:AD:GQ 0/1:11:7,4:4 1/1:15:4,11:2
1 10291 . C T . . . GT:DP:AD:GQ 0/1:11:3,8:7 0/1:15:2,13:6
2 changes: 2 additions & 0 deletions tests/seqvars/ingest/clair3_glnexus.ped
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
FAM SAMPLE1 0 0 1 2
FAM SAMPLE2 0 0 2 2
55 changes: 55 additions & 0 deletions tests/seqvars/ingest/clair3_glnexus.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
##fileformat=VCFv4.2
##NOTE=We split the RNC values with a comma where originally they are a 2-character string.
##FILTER=<ID=PASS,Description="All filters passed">
##GLnexusVersion=v1.4.1-0-g68e25e5
##GLnexusConfigName=/tmp/clair3.yml
##GLnexusConfigCRC32C=496041348
##GLnexusConfig={unifier_config: {drop_filtered: false, min_allele_copy_number: 1, min_AQ1: 10, min_AQ2: 10, min_GQ: 2, max_alleles_per_site: 32, monoallelic_sites_for_lost_alleles: true, preference: common}, genotyper_config: {revise_genotypes: true, min_assumed_allele_frequency: 9.99999975e-05, snv_prior_calibration: 1, indel_prior_calibration: 1, required_dp: 1, allow_partial_data: true, allele_dp_format: AD, ref_dp_format: MIN_DP, output_residuals: false, more_PL: true, squeeze: false, trim_uncalled_alleles: true, top_two_half_calls: false, output_format: BCF, liftover_fields: [{orig_names: [MIN_DP, DP], name: DP, description: "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Approximate read depth (reads with MQ=255 or with bad mates are filtered)\">", type: int, number: basic, default_type: missing, count: 1, combi_method: min, ignore_non_variants: true}, {orig_names: [AD], name: AD, description: "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">", type: int, number: alleles, default_type: zero, count: 0, combi_method: min, ignore_non_variants: false}, {orig_names: [GQ], name: GQ, description: "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">", type: int, number: basic, default_type: missing, count: 1, combi_method: min, ignore_non_variants: true}, {orig_names: [PL], name: PL, description: "##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"Phred-scaled genotype Likelihoods\">", type: int, number: genotype, default_type: missing, count: 0, combi_method: missing, ignore_non_variants: true}]}}
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency estimate for each alternate allele">
##INFO=<ID=AQ,Number=A,Type=Integer,Description="Allele Quality score reflecting evidence for each alternate allele (Phred scale)">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##FILTER=<ID=MONOALLELIC,Description="Site represents one ALT allele in a region with multiple variants that could not be unified into non-overlapping multi-allelic sites">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=RNC,Number=2,Type=Character,Description="Reason for No Call in GT: . = n/a, M = Missing data, P = Partial data, I = gVCF input site is non-called, D = insufficient Depth of coverage, - = unrepresentable overlapping deletion, L = Lost/unrepresentable allele (other than deletion), U = multiple Unphased variants present, O = multiple Overlapping variants present, 1 = site is Monoallelic, no assertion about presence of REF or ALT allele">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype Likelihoods">
##contig=<ID=1,length=249250621>
##contig=<ID=2,length=243199373>
##contig=<ID=3,length=198022430>
##contig=<ID=4,length=191154276>
##contig=<ID=5,length=180915260>
##contig=<ID=6,length=171115067>
##contig=<ID=7,length=159138663>
##contig=<ID=8,length=146364022>
##contig=<ID=9,length=141213431>
##contig=<ID=10,length=135534747>
##contig=<ID=11,length=135006516>
##contig=<ID=12,length=133851895>
##contig=<ID=13,length=115169878>
##contig=<ID=14,length=107349540>
##contig=<ID=15,length=102531392>
##contig=<ID=16,length=90354753>
##contig=<ID=17,length=81195210>
##contig=<ID=18,length=78077248>
##contig=<ID=19,length=59128983>
##contig=<ID=20,length=63025520>
##contig=<ID=21,length=48129895>
##contig=<ID=22,length=51304566>
##contig=<ID=X,length=155270560>
##contig=<ID=Y,length=59373566>
##bcftools_viewVersion=1.19+htslib-1.19.1
##bcftools_viewCommand=view test.bcf; Date=Thu Feb 8 15:26:01 2024
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 SAMPLE2
1 10108 1_10108_C_CT C CT 16 . AF=0.5;AQ=16 GT:DP:AD:GQ:PL:RNC ./.:17:17,0:0:128,0,278:II 1/1:15:1,14:1:16,4,0:..
1 10109 1_10109_A_T A T 10 . AF=0.25;AQ=10 GT:DP:AD:GQ:PL:RNC 0/1:11:7,3:9:10,0,25:.. 0/0:15:15,0:44:0,45,449:..
1 10147 1_10147_C_CCCCTAA C CCCCTAA 24 . AF=0.5;AQ=24 GT:DP:AD:GQ:PL:RNC 0/1:11:3,3:5:16,0,15:.. 0/1:15:1,11:4:24,0,10:..
1 10206 1_10206_A_AC A AC 11 . AF=0.25;AQ=11 GT:DP:AD:GQ:PL:RNC ./.:15:15,0:0:104,0,254:I,I 0/1:15:4,11:2:11,0,8:..
1 10212 1_10212_A_AC A AC 10 . AF=0.25;AQ=10 GT:DP:AD:GQ:PL:RNC ./.:15:15,0:0:104,0,254:I,I 0/1:15:3,11:1:10,0,5:..
1 10218 1_10218_A_AC A AC 10 . AF=0.25;AQ=10 GT:DP:AD:GQ:PL:RNC ./.:13:13,0:0:50,0,260:I,I 0/1:15:2,11:0:10,0,5:..
1 10230 1_10231_C_CCCTAA;1_10230_AC_A AC ACCCTAA,A 23 . AF=0.5,0.25;AQ=19,23 GT:DP:AD:GQ:PL:RNC 1/2:11:.,3,6:5:0,0,0,0,0,0:.. 0/1:15:2,11,0:2:19,0,8,990,990,990:..
1 10254 1_10254_T_C T C 26 . AF=0.75;AQ=26 GT:DP:AD:GQ:PL:RNC 0/1:11:6,2:5:5,0,23:.. 1/1:15:2,11:2:26,3,0:..
1 10257 1_10257_A_C A C 29 . AF=0.75;AQ=29 GT:DP:AD:GQ:PL:RNC 0/1:11:6,4:4:7,0,7:.. 1/1:15:3,11:2:29,3,0:..
1 10291 1_10291_C_T C T 27 . AF=0.5;AQ=27 GT:DP:AD:GQ:PL:RNC 0/1:11:1,8:7:20,0,7:.. 0/1:15:1,13:6:27,0,3:..

0 comments on commit cdb6064

Please sign in to comment.