From f5f18341ba4ea745b961902e169877bf0407f4a3 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Thu, 22 Feb 2024 07:28:06 +0100 Subject: [PATCH] fix: work around issue with glnexus #362 --- misc/fix_glnexus.py | 64 ------------------- src/annotate/seqvars/mod.rs | 41 +++++++----- .../clair3-glnexus-min.multiallelic.vcf | 4 +- .../seqvars/clair3-glnexus-min.original.vcf | 3 - .../annotate/seqvars/clair3-glnexus-min.vcf | 4 +- 5 files changed, 28 insertions(+), 88 deletions(-) delete mode 100755 misc/fix_glnexus.py delete mode 100644 tests/data/annotate/seqvars/clair3-glnexus-min.original.vcf diff --git a/misc/fix_glnexus.py b/misc/fix_glnexus.py deleted file mode 100755 index f404ee85..00000000 --- a/misc/fix_glnexus.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -""" -Fix GLNexus output for input to `mehari annotate seqvars`. - -First, note that multiallelic sites must be corrected by a call to -`bcftools norm` already as an input to this script. - -What we will do is splitting the `FORMAT/RNC` field with a comma -as noodles-vcf expects a list of characters for rather than a -string of length 2 for `##FORMAT=`. -""" - -from pathlib import Path -import sys -from typing import Annotated - -import typer -import vcfpy - - -def main( - path_in: Annotated[Path, typer.Option(help="Path to input VCF")], - path_out: Annotated[Path, typer.Option(help="Path to output VCF")], - quiet: Annotated[bool, typer.Option(help="Disable verbose output")] = False, -): - """ - Fix GLNexus output to be compatible with `noodles-vcf`. - - cf: https://github.com/varfish-org/mehari/issues/356 - """ - if not quiet: - print(f"Opening input file {path_in}", file=sys.stderr) - reader = vcfpy.Reader.from_path(path_in) - header = reader.header.copy() - for line in header.lines: - if line.key == "FORMAT" and line.mapping.get("ID") == "GQ": - line.mapping["Number"] = 1 - line.mapping["Type"] = "Integer" - if not quiet: - print(f"Opening output file {path_out}", file=sys.stderr) - writer = vcfpy.Writer.from_path(path_out, header) - if not quiet: - print("Processing records...", file=sys.stderr) - with reader, writer: - for idx, record in enumerate(reader): - if idx % 10_000 == 0: - print( - f" at {idx} records {record.CHROM}:{record.POS}", file=sys.stderr - ) - for call in record.calls: - if "RNC" in call.data: - if ( - isinstance(call.data["RNC"], list) - and len(call.data["RNC"]) == 1 - ): - call.data["RNC"] = list(call.data["RNC"][0]) - writer.write_record(record) - if not quiet: - print("... done", file=sys.stderr) - print("All done. Have a nice day!", file=sys.stderr) - - -if __name__ == "__main__": - typer.run(main) diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 89ffaebe..020e0bc2 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -11,6 +11,7 @@ use annonars::common::keys; use annonars::freqs::cli::import::reading::guess_assembly; use annonars::freqs::serialized::{auto, mt, xy}; use anyhow::anyhow; +use noodles_vcf::header::record::value::map::format::Type as FormatType; use noodles_vcf::record::genotypes::keys::key::{ self, CONDITIONAL_GENOTYPE_QUALITY, GENOTYPE, READ_DEPTH, READ_DEPTHS, }; @@ -31,7 +32,7 @@ use flate2::write::GzEncoder; use flate2::Compression; use noodles_bgzf::Writer as BgzfWriter; use noodles_vcf::header::{ - record::value::map::{info::Type, Info}, + record::value::map::{info::Type as InfoType, Info}, Number, }; use noodles_vcf::reader::Builder as VariantReaderBuilder; @@ -128,7 +129,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("gnomad_exomes_an").unwrap(), Map::::new( Number::Count(1), - Type::Integer, + InfoType::Integer, "Number of alleles in gnomAD exomes", ), ); @@ -136,7 +137,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("gnomad_exomes_hom").unwrap(), Map::::new( Number::Count(1), - Type::Integer, + InfoType::Integer, "Number of hom. alt. carriers in gnomAD exomes", ), ); @@ -144,7 +145,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("gnomad_exomes_het").unwrap(), Map::::new( Number::Count(1), - Type::Integer, + InfoType::Integer, "Number of het. alt. carriers in gnomAD exomes", ), ); @@ -152,7 +153,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("gnomad_exomes_hemi").unwrap(), Map::::new( Number::Count(1), - Type::Integer, + InfoType::Integer, "Number of hemi. alt. carriers in gnomAD exomes", ), ); @@ -161,7 +162,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("gnomad_genomes_an").unwrap(), Map::::new( Number::Count(1), - Type::Integer, + InfoType::Integer, "Number of alleles in gnomAD genomes", ), ); @@ -169,7 +170,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("gnomad_genomes_hom").unwrap(), Map::::new( Number::Count(1), - Type::Integer, + InfoType::Integer, "Number of hom. alt. carriers in gnomAD genomes", ), ); @@ -177,7 +178,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("gnomad_genomes_het").unwrap(), Map::::new( Number::Count(1), - Type::Integer, + InfoType::Integer, "Number of het. alt. carriers in gnomAD genomes", ), ); @@ -185,7 +186,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("gnomad_genomes_hemi").unwrap(), Map::::new( Number::Count(1), - Type::Integer, + InfoType::Integer, "Number of hemi. alt. carriers in gnomAD genomes", ), ); @@ -194,7 +195,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("helix_an").unwrap(), Map::::new( Number::Count(1), - Type::Integer, + InfoType::Integer, "Number of alleles in HelixMtDb", ), ); @@ -202,7 +203,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("helix_hom").unwrap(), Map::::new( Number::Count(1), - Type::Integer, + InfoType::Integer, "Number of hom. alt. carriers in HelixMtDb", ), ); @@ -210,7 +211,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("helix_het").unwrap(), Map::::new( Number::Count(1), - Type::Integer, + InfoType::Integer, "Number of het. alt. carriers in HelixMtDb", ), ); @@ -219,7 +220,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("ANN").unwrap(), Map::::new( Number::Unknown, - Type::String, + InfoType::String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | \ Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | \ cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | \ @@ -231,7 +232,7 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("clinvar_clinsig").unwrap(), Map::::new( Number::Count(1), - Type::String, + InfoType::String, "ClinVar clinical significance (one highest significance per VCV)", ), ); @@ -239,13 +240,13 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { field::Key::from_str("clinvar_rcv").unwrap(), Map::::new( Number::Count(1), - Type::String, + InfoType::String, "ClinVar RCV accession (corresponds to clinvar_clinsig)", ), ); header_out.infos_mut().insert( field::Key::from_str("clinvar_vcv").unwrap(), - Map::::new(Number::Count(1), Type::String, "ClinVar VCV accession"), + Map::::new(Number::Count(1), InfoType::String, "ClinVar VCV accession"), ); header_out @@ -1416,9 +1417,15 @@ impl AnnotatedVcfWriter for VarFishSeqvarTsvWriter { fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(), anyhow::Error> { tracing::info!("Open VCF and read header"); let mut reader = VariantReaderBuilder::default().build_from_path(&args.path_input_vcf)?; - let header_in = reader.read_header()?; + let mut header_in = reader.read_header()?; let header_out = build_header(&header_in); + // Work around glnexus issue with RNC. + if let Some(format) = header_in.formats_mut().get_mut("RNC") { + *format.number_mut() = Number::Count(1); + *format.type_mut() = FormatType::String; + } + // Guess genome release from contigs in VCF header. let genome_release = args.genome_release.map(|gr| match gr { GenomeRelease::Grch37 => Assembly::Grch37p10, // has chrMT! diff --git a/tests/data/annotate/seqvars/clair3-glnexus-min.multiallelic.vcf b/tests/data/annotate/seqvars/clair3-glnexus-min.multiallelic.vcf index b70381c4..96e0439b 100644 --- a/tests/data/annotate/seqvars/clair3-glnexus-min.multiallelic.vcf +++ b/tests/data/annotate/seqvars/clair3-glnexus-min.multiallelic.vcf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acdcc6065fcab6156f630b1fe27afb9cedd09adac0f84215a879120ae84a3162 -size 5548 +oid sha256:34375a648888f2d0d71e2396c3feb101c29205444920cd20f59c8b174e7de514 +size 5531 diff --git a/tests/data/annotate/seqvars/clair3-glnexus-min.original.vcf b/tests/data/annotate/seqvars/clair3-glnexus-min.original.vcf deleted file mode 100644 index 41e5f0e4..00000000 --- a/tests/data/annotate/seqvars/clair3-glnexus-min.original.vcf +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8adde9396d07ce7379922e57f7c9ca6e0b96ea7b4d1203075f3cb2d017c03f1 -size 5436 diff --git a/tests/data/annotate/seqvars/clair3-glnexus-min.vcf b/tests/data/annotate/seqvars/clair3-glnexus-min.vcf index 74db4010..32943db0 100644 --- a/tests/data/annotate/seqvars/clair3-glnexus-min.vcf +++ b/tests/data/annotate/seqvars/clair3-glnexus-min.vcf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7ae1ba6ba0989506e92fbf1fef393fc4eb7e66cba2add3a8dbfb540a7571fbe -size 5791 +oid sha256:47cf50a4687fb5a82ac413119c97a73ef17afa635ea243ef35a21454c3f105ae +size 5769