Skip to content

Commit

Permalink
Support lenient (optimistic) read-only VCF 4.4.
Browse files Browse the repository at this point in the history
  • Loading branch information
cmnbroad committed Sep 5, 2023
1 parent 38dbd0e commit 8fa5da7
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 11 deletions.
15 changes: 12 additions & 3 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -129,17 +129,26 @@ task testWithDefaultReference(type: Test) {
}
}

task testWithOptimisticVCF4_4(type: Test) {
description = "Run tests with optimistic VCF 4.4 reading"
jvmArgs += '-Dsamjdk.optimistic_vcf_4_4=true'

useTestNG {
includeGroups "optimistic_vcf_4_4"
}
}

test {
description = "Runs the unit tests other than the SRA tests"

useTestNG {
if (OperatingSystem.current().isUnix()) {
excludeGroups "slow", "broken", "defaultReference", "ftp", "http", "sra", "ena"
excludeGroups "slow", "broken", "defaultReference", "optimistic_vcf_4_4", "ftp", "http", "sra", "ena"
} else {
excludeGroups "slow", "broken", "defaultReference", "ftp", "http", "sra", "ena", "unix"
excludeGroups "slow", "broken", "defaultReference", "optimistic_vcf_4_4", "ftp", "http", "sra", "ena", "unix"
}
}
} dependsOn testWithDefaultReference
} dependsOn testWithDefaultReference, testWithOptimisticVCF4_4


task testFTP(type: Test) {
Expand Down
8 changes: 7 additions & 1 deletion src/main/java/htsjdk/samtools/Defaults.java
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ public class Defaults {
*/
public static final String DEFAULT_VCF_EXTENSION;

// accept VCF 4.4 files for read, but treat them as VCF 4.3
public static final boolean OPTIMISTIC_VCF_4_4;

/**
* Even if BUFFER_SIZE is 0, this is guaranteed to be non-zero. If BUFFER_SIZE is non-zero,
* this == BUFFER_SIZE (Default = 128k).
Expand Down Expand Up @@ -105,12 +108,14 @@ public class Defaults {
*/
public static final String DISABLE_SNAPPY_PROPERTY_NAME = "snappy.disable";

public static final String OPTIMISTIC_VCF_4_4_PROPERTY = "optimistic_vcf_4_4";


/**
* Disable use of the Snappy compressor. Default = false.
*/
public static final boolean DISABLE_SNAPPY_COMPRESSOR;


public static final String SAMJDK_PREFIX = "samjdk.";
static {
CREATE_INDEX = getBooleanProperty("create_index", false);
Expand All @@ -134,6 +139,7 @@ public class Defaults {
SAM_FLAG_FIELD_FORMAT = SamFlagField.valueOf(getStringProperty("sam_flag_field_format", SamFlagField.DECIMAL.name()));
SRA_LIBRARIES_DOWNLOAD = getBooleanProperty("sra_libraries_download", false);
DISABLE_SNAPPY_COMPRESSOR = getBooleanProperty(DISABLE_SNAPPY_PROPERTY_NAME, false);
OPTIMISTIC_VCF_4_4 = getBooleanProperty(OPTIMISTIC_VCF_4_4_PROPERTY, false);
}

/**
Expand Down
12 changes: 9 additions & 3 deletions src/main/java/htsjdk/variant/vcf/VCFCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@

package htsjdk.variant.vcf;

import htsjdk.samtools.Defaults;
import htsjdk.samtools.util.Log;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.readers.LineIterator;

Expand Down Expand Up @@ -72,6 +74,8 @@
* @since 2010
*/
public class VCFCodec extends AbstractVCFCodec {
private final static Log log = Log.getInstance(VCFCodec.class);

// Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4";

Expand All @@ -96,9 +100,11 @@ public Object readActualHeader(final LineIterator lineIterator) {
throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version");
foundHeaderVersion = true;
version = VCFHeaderVersion.toHeaderVersion(lineFields[1]);
if ( ! version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) )
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]);
if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 && version != VCFHeaderVersion.VCF4_3)
if (Defaults.OPTIMISTIC_VCF_4_4 == true && version == VCFHeaderVersion.VCF4_4 ) {
// if optimistic VCFv4.4 is enabled, accept VCFv4.4 as input, but treat it as VCFv4.3, and hope for the best
log.warn("********** VCFv4.4 is not yet fully supported - processing VCFv4.4 input as VCFv4.3! **********");
version = VCFHeaderVersion.VCF4_3;
} else if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 && version != VCFHeaderVersion.VCF4_3)
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]);
}
headerStrings.add(lineIterator.next());
Expand Down
5 changes: 4 additions & 1 deletion src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ public enum VCFHeaderVersion {
VCF4_0("VCFv4.0", "fileformat"),
VCF4_1("VCFv4.1", "fileformat"),
VCF4_2("VCFv4.2", "fileformat"),
VCF4_3("VCFv4.3", "fileformat");
VCF4_3("VCFv4.3", "fileformat"),
// VCFv4.4 is not fully supported yet, but we need this to support optimistic reading when
// the "optimistic_vcf_4_4" property is set
VCF4_4("VCFv4.4", "fileformat");

private final String versionString;
private final String formatString;
Expand Down
13 changes: 12 additions & 1 deletion src/test/java/htsjdk/tribble/AbstractFeatureReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
import htsjdk.tribble.readers.LineIterator;
import htsjdk.variant.VariantBaseTest;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderVersion;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
Expand Down Expand Up @@ -97,7 +100,15 @@ public void testBlockCompressionExtension(final String testURIString, final bool
public void testBlockCompressionExtensionStringVersion(final String testURIString, final boolean expected) {
Assert.assertEquals(AbstractFeatureReader.hasBlockCompressedExtension(testURIString), expected);
}

@Test(groups = "optimistic_vcf_4_4")
public void testVCF4_4Optimistic() {
final AbstractFeatureReader<VariantContext, ?> fr = AbstractFeatureReader.getFeatureReader(
Paths.get("src/test/resources/htsjdk/variant/", "VCF4_4HeaderTest.vcf").toString(),
new VCFCodec(),
false);
final VCFHeader vcfHeader = (VCFHeader) fr.getHeader();
Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_3);
}

@DataProvider(name = "vcfFileAndWrapperCombinations")
private static Object[][] vcfFileAndWrapperCombinations(){
Expand Down
17 changes: 15 additions & 2 deletions src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
import com.google.common.jimfs.Configuration;
import com.google.common.jimfs.Jimfs;
import htsjdk.HtsjdkTest;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.seekablestream.SeekableStreamFactory;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.TestUtils;
import htsjdk.tribble.TribbleException;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
Expand Down Expand Up @@ -50,6 +49,10 @@ Object[][] pathsData() {
// various ways to refer to a local file
{TEST_DATA_DIR + "VCF4HeaderTest.vcf", null, false, true},

// this file is the same as VCF4HeaderTest.vcf, except the header is marked as VCF 4.4
// this fails unless the "optimistic_vcf_4_4" property is set, so it's expected to fail here
{TEST_DATA_DIR + "VCF4_4HeaderTest.vcf", null, false, false},

// // this is almost a vcf, but not quite it's missing the #CHROM line and it has no content...
{TEST_DATA_DIR + "Homo_sapiens_assembly38.tile_db_header.vcf", null, false, false},

Expand Down Expand Up @@ -102,6 +105,16 @@ public void testCanOpenVCFPathReader(final String file, final String index, fina
Assert.assertTrue(shouldSucceed, "Test should have failed but succeeded");
}

@Test(groups = "optimistic_vcf_4_4")
public void testAcceptOptimisticVCF4_4() {
// This file is the same as VCF4HeaderTest.vcf, except the header is marked as VCF 4.4
// This will fail unless the optimistic_vcf_4_4" property isn't set
try (final VCFFileReader reader = new VCFFileReader(Paths.get(TEST_DATA_DIR.getAbsolutePath(), "VCF4_4HeaderTest.vcf"), false)) {
final VCFHeader header = reader.getFileHeader();
Assert.assertEquals(header.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_3);
}
}

@Test
public void testTabixFileWithEmbeddedSpaces() throws IOException {
final File testVCF = new File(TEST_DATA_DIR, "HiSeq.10000.vcf.bgz");
Expand Down
42 changes: 42 additions & 0 deletions src/test/resources/htsjdk/variant/VCF4_4HeaderTest.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
##fileformat=VCFv4.4
##FILTER=<ID=ABFilter,Description="AB 0.75 && DP 40">
##FILTER=<ID=DPFilter,Description="DP 120 || SB -0.10">
##FILTER=<ID=FDRtranche0.00to0.10,Description="FDR tranche level at qual 0.06">
##FILTER=<ID=FDRtranche0.10to1.00,Description="FDR tranche level at qual 0.03">
##FILTER=<ID=FDRtranche1.00to2.00,Description="FDR tranche level at qual 0.02">
##FILTER=<ID=FDRtranche2.00to10.00+,Description="FDR tranche level at qual > 0.06">
##FILTER=<ID=FDRtranche2.00to10.00,Description="FDR tranche level at qual unknown">
##FILTER=<ID=HARD_TO_VALIDATE,Description="MQ0 = 4 && ((MQ0 / (1.0 * DP)) 0.1)">
##FILTER=<ID=Indel,Description="Overlaps a user-input mask">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=LowQual,Description="QUAL 50.0">
##FILTER=<ID=ANNOTATION,Description="ANNOTATION != \"NA\" || ANNOTATION <= 0.01">
##FILTER=<ID=ANNOTATION2,Description="ANNOTATION with quote \" that is unmatched but escaped">
##FILTER=<ID=SnpCluster,Description="SNPs found in clusters">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">
##FORMAT=<ID=GL,Number=3,Type=Float,Description="Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions">
##INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest Contiguous Homopolymer Run of Variant Allele In Either Direction">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with two (and only two) segregating haplotypes">
##INFO=<ID=EscapingQuote,Number=1,Type=Float,Description="This description has an escaped \" quote in it">
##INFO=<ID=EscapingBackslash,Number=1,Type=Float,Description="This description has an escaped \\ backslash in it">
##INFO=<ID=EscapingNonQuoteOrBackslash,Number=1,Type=Float,Description="This other value has a \n newline in it">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=OQ,Number=1,Type=Float,Description="The original variant quality score">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=SB,Number=1,Type=Float,Description="Strand Bias">
##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[/humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-23/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-24/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-5/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-9/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-6/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-19/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-25/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-4/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-14/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-22/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-2/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-3/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-7/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-16/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-1/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-17/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-8/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-10/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-18/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-20/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-11/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-15/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-21/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-12/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-13/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam] read_buffer_size=null read_filter=[] intervals=[chrX] excludeIntervals=[chrM, chrY] reference_sequence=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta rodBind=[dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod, interval,Intervals,chrX] rodToIntervalTrackName=null BTI_merge_rule=UNION DBSNP=/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod hapmap=null hapmap_chip=null out=null err=null outerr=null filterZeroMappingQualityReads=false downsampling_type=NONE downsample_to_fraction=null downsample_to_coverage=null useOriginalQualities=false validation_strictness=SILENT unsafe=null max_reads_at_locus=10000 num_threads=1 interval_merging=ALL read_group_black_list=null genotype_model=JOINT_ESTIMATE base_model=EMPIRICAL heterozygosity=7.8E-4 genotype=false output_all_callable_bases=false standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=10.0 trigger_min_confidence_threshold_for_calling=30.0 trigger_min_confidence_threshold_for_emitting=30.0 noSLOD=false assume_single_sample_reads=null platform=null min_base_quality_score=20 min_mapping_quality_score=20 max_mismatches_in_40bp_window=3 use_reads_with_bad_mates=false max_deletion_fraction=0.05 cap_base_quality_by_mapping_quality=false"
##VariantFiltration="analysis_type=VariantFiltration input_file=[] read_buffer_size=null read_filter=[] intervals=null excludeIntervals=[chrM, chrY] reference_sequence=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta rodBind=[variant,VCF,wgs.v9/HiSeq.WGS.cleaned.ug.snpfiltered.vcf, mask,Bed,wgs.v9/HiSeq.WGS.cleaned.indels.10.mask] rodToIntervalTrackName=null BTI_merge_rule=UNION DBSNP=null hapmap=null hapmap_chip=null out=wgs.v9/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf err=null outerr=null filterZeroMappingQualityReads=false downsampling_type=NONE downsample_to_fraction=null downsample_to_coverage=null useOriginalQualities=false validation_strictness=SILENT unsafe=null max_reads_at_locus=2147483647 num_threads=1 interval_merging=ALL read_group_black_list=null filterExpression=[] filterName=[] genotypeFilterExpression=[] genotypeFilterName=[] clusterSize=3 clusterWindowSize=0 maskName=Indel NO_HEADER=false"
##VariantFiltration="analysis_type=VariantFiltration input_file=[] read_buffer_size=null read_filter=[] intervals=null excludeIntervals=[chrM, chrY] reference_sequence=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta rodBind=[variant,VCF,wgs.v9/HiSeq.WGS.cleaned.ug.vcf] rodToIntervalTrackName=null BTI_merge_rule=UNION DBSNP=null hapmap=null hapmap_chip=null out=wgs.v9/HiSeq.WGS.cleaned.ug.snpfiltered.vcf err=null outerr=null filterZeroMappingQualityReads=false downsampling_type=NONE downsample_to_fraction=null downsample_to_coverage=null useOriginalQualities=false validation_strictness=SILENT unsafe=null max_reads_at_locus=2147483647 num_threads=1 interval_merging=ALL read_group_black_list=null filterExpression=[QUAL < 50.0, MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1), AB > 0.75 && DP > 40, DP > 120 || SB > -0.10] filterName=[LowQual, HARD_TO_VALIDATE, ABFilter, DPFilter] genotypeFilterExpression=[] genotypeFilterName=[] clusterSize=3 clusterWindowSize=10 maskName=Mask NO_HEADER=false"
##source=VariantOptimizer
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
chr1 109 . A T 0 FDRtranche2.00to10.00+ AC=1;AF=0.50;AN=2;DP=1019;Dels=0.00;HRun=0;HaplotypeScore=686.65;MQ=19.20;MQ0=288;OQ=2175.54;QD=2.13;SB=-1042.18 GT:AD:DP:GL:GQ 0/1:610,327:308:-316.30,-95.47,-803.03:99

0 comments on commit 8fa5da7

Please sign in to comment.