From 8fa5da7d7557485987b3c999afcee058535bc9b8 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Tue, 5 Sep 2023 14:36:11 -0400 Subject: [PATCH] Support lenient (optimistic) read-only VCF 4.4. --- build.gradle | 15 +++++-- src/main/java/htsjdk/samtools/Defaults.java | 8 +++- .../java/htsjdk/variant/vcf/VCFCodec.java | 12 ++++-- .../htsjdk/variant/vcf/VCFHeaderVersion.java | 5 ++- .../tribble/AbstractFeatureReaderTest.java | 13 +++++- .../htsjdk/variant/vcf/VCFFileReaderTest.java | 17 +++++++- .../htsjdk/variant/VCF4_4HeaderTest.vcf | 42 +++++++++++++++++++ 7 files changed, 101 insertions(+), 11 deletions(-) create mode 100644 src/test/resources/htsjdk/variant/VCF4_4HeaderTest.vcf diff --git a/build.gradle b/build.gradle index 948e7e7b84..c3cf751b72 100644 --- a/build.gradle +++ b/build.gradle @@ -129,17 +129,26 @@ task testWithDefaultReference(type: Test) { } } +task testWithOptimisticVCF4_4(type: Test) { + description = "Run tests with optimistic VCF 4.4 reading" + jvmArgs += '-Dsamjdk.optimistic_vcf_4_4=true' + + useTestNG { + includeGroups "optimistic_vcf_4_4" + } +} + test { description = "Runs the unit tests other than the SRA tests" useTestNG { if (OperatingSystem.current().isUnix()) { - excludeGroups "slow", "broken", "defaultReference", "ftp", "http", "sra", "ena" + excludeGroups "slow", "broken", "defaultReference", "optimistic_vcf_4_4", "ftp", "http", "sra", "ena" } else { - excludeGroups "slow", "broken", "defaultReference", "ftp", "http", "sra", "ena", "unix" + excludeGroups "slow", "broken", "defaultReference", "optimistic_vcf_4_4", "ftp", "http", "sra", "ena", "unix" } } -} dependsOn testWithDefaultReference +} dependsOn testWithDefaultReference, testWithOptimisticVCF4_4 task testFTP(type: Test) { diff --git a/src/main/java/htsjdk/samtools/Defaults.java b/src/main/java/htsjdk/samtools/Defaults.java index 0cfff05d5d..4bf9d87729 100644 --- a/src/main/java/htsjdk/samtools/Defaults.java +++ b/src/main/java/htsjdk/samtools/Defaults.java @@ -59,6 +59,9 @@ public class Defaults { */ public static final String DEFAULT_VCF_EXTENSION; + // accept VCF 4.4 files for read, but treat them as VCF 4.3 + public static final boolean OPTIMISTIC_VCF_4_4; + /** * Even if BUFFER_SIZE is 0, this is guaranteed to be non-zero. If BUFFER_SIZE is non-zero, * this == BUFFER_SIZE (Default = 128k). @@ -105,12 +108,14 @@ public class Defaults { */ public static final String DISABLE_SNAPPY_PROPERTY_NAME = "snappy.disable"; + public static final String OPTIMISTIC_VCF_4_4_PROPERTY = "optimistic_vcf_4_4"; + + /** * Disable use of the Snappy compressor. Default = false. */ public static final boolean DISABLE_SNAPPY_COMPRESSOR; - public static final String SAMJDK_PREFIX = "samjdk."; static { CREATE_INDEX = getBooleanProperty("create_index", false); @@ -134,6 +139,7 @@ public class Defaults { SAM_FLAG_FIELD_FORMAT = SamFlagField.valueOf(getStringProperty("sam_flag_field_format", SamFlagField.DECIMAL.name())); SRA_LIBRARIES_DOWNLOAD = getBooleanProperty("sra_libraries_download", false); DISABLE_SNAPPY_COMPRESSOR = getBooleanProperty(DISABLE_SNAPPY_PROPERTY_NAME, false); + OPTIMISTIC_VCF_4_4 = getBooleanProperty(OPTIMISTIC_VCF_4_4_PROPERTY, false); } /** diff --git a/src/main/java/htsjdk/variant/vcf/VCFCodec.java b/src/main/java/htsjdk/variant/vcf/VCFCodec.java index 42f07150d1..2e6b152908 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCodec.java @@ -25,6 +25,8 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.Defaults; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; import htsjdk.tribble.readers.LineIterator; @@ -72,6 +74,8 @@ * @since 2010 */ public class VCFCodec extends AbstractVCFCodec { + private final static Log log = Log.getInstance(VCFCodec.class); + // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; @@ -96,9 +100,11 @@ public Object readActualHeader(final LineIterator lineIterator) { throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); foundHeaderVersion = true; version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( ! version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]); - if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 && version != VCFHeaderVersion.VCF4_3) + if (Defaults.OPTIMISTIC_VCF_4_4 == true && version == VCFHeaderVersion.VCF4_4 ) { + // if optimistic VCFv4.4 is enabled, accept VCFv4.4 as input, but treat it as VCFv4.3, and hope for the best + log.warn("********** VCFv4.4 is not yet fully supported - processing VCFv4.4 input as VCFv4.3! **********"); + version = VCFHeaderVersion.VCF4_3; + } else if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 && version != VCFHeaderVersion.VCF4_3) throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]); } headerStrings.add(lineIterator.next()); diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java index 43f43c65c3..0f073efcb3 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java @@ -37,7 +37,10 @@ public enum VCFHeaderVersion { VCF4_0("VCFv4.0", "fileformat"), VCF4_1("VCFv4.1", "fileformat"), VCF4_2("VCFv4.2", "fileformat"), - VCF4_3("VCFv4.3", "fileformat"); + VCF4_3("VCFv4.3", "fileformat"), + // VCFv4.4 is not fully supported yet, but we need this to support optimistic reading when + // the "optimistic_vcf_4_4" property is set + VCF4_4("VCFv4.4", "fileformat"); private final String versionString; private final String formatString; diff --git a/src/test/java/htsjdk/tribble/AbstractFeatureReaderTest.java b/src/test/java/htsjdk/tribble/AbstractFeatureReaderTest.java index c789105ba8..f212dbb3d5 100644 --- a/src/test/java/htsjdk/tribble/AbstractFeatureReaderTest.java +++ b/src/test/java/htsjdk/tribble/AbstractFeatureReaderTest.java @@ -11,7 +11,10 @@ import htsjdk.tribble.readers.LineIterator; import htsjdk.variant.VariantBaseTest; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFCodec; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderVersion; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -97,7 +100,15 @@ public void testBlockCompressionExtension(final String testURIString, final bool public void testBlockCompressionExtensionStringVersion(final String testURIString, final boolean expected) { Assert.assertEquals(AbstractFeatureReader.hasBlockCompressedExtension(testURIString), expected); } - + @Test(groups = "optimistic_vcf_4_4") + public void testVCF4_4Optimistic() { + final AbstractFeatureReader fr = AbstractFeatureReader.getFeatureReader( + Paths.get("src/test/resources/htsjdk/variant/", "VCF4_4HeaderTest.vcf").toString(), + new VCFCodec(), + false); + final VCFHeader vcfHeader = (VCFHeader) fr.getHeader(); + Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_3); + } @DataProvider(name = "vcfFileAndWrapperCombinations") private static Object[][] vcfFileAndWrapperCombinations(){ diff --git a/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java b/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java index d848209d99..0bcd1fbc6a 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java @@ -3,10 +3,9 @@ import com.google.common.jimfs.Configuration; import com.google.common.jimfs.Jimfs; import htsjdk.HtsjdkTest; -import htsjdk.samtools.seekablestream.SeekableStream; -import htsjdk.samtools.seekablestream.SeekableStreamFactory; import htsjdk.samtools.util.IOUtil; import htsjdk.tribble.TestUtils; +import htsjdk.tribble.TribbleException; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -50,6 +49,10 @@ Object[][] pathsData() { // various ways to refer to a local file {TEST_DATA_DIR + "VCF4HeaderTest.vcf", null, false, true}, + // this file is the same as VCF4HeaderTest.vcf, except the header is marked as VCF 4.4 + // this fails unless the "optimistic_vcf_4_4" property is set, so it's expected to fail here + {TEST_DATA_DIR + "VCF4_4HeaderTest.vcf", null, false, false}, + // // this is almost a vcf, but not quite it's missing the #CHROM line and it has no content... {TEST_DATA_DIR + "Homo_sapiens_assembly38.tile_db_header.vcf", null, false, false}, @@ -102,6 +105,16 @@ public void testCanOpenVCFPathReader(final String file, final String index, fina Assert.assertTrue(shouldSucceed, "Test should have failed but succeeded"); } + @Test(groups = "optimistic_vcf_4_4") + public void testAcceptOptimisticVCF4_4() { + // This file is the same as VCF4HeaderTest.vcf, except the header is marked as VCF 4.4 + // This will fail unless the optimistic_vcf_4_4" property isn't set + try (final VCFFileReader reader = new VCFFileReader(Paths.get(TEST_DATA_DIR.getAbsolutePath(), "VCF4_4HeaderTest.vcf"), false)) { + final VCFHeader header = reader.getFileHeader(); + Assert.assertEquals(header.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_3); + } + } + @Test public void testTabixFileWithEmbeddedSpaces() throws IOException { final File testVCF = new File(TEST_DATA_DIR, "HiSeq.10000.vcf.bgz"); diff --git a/src/test/resources/htsjdk/variant/VCF4_4HeaderTest.vcf b/src/test/resources/htsjdk/variant/VCF4_4HeaderTest.vcf new file mode 100644 index 0000000000..9e79b73b7b --- /dev/null +++ b/src/test/resources/htsjdk/variant/VCF4_4HeaderTest.vcf @@ -0,0 +1,42 @@ +##fileformat=VCFv4.4 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= 0.06"> +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[/humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-23/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-24/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-5/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-9/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-6/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-19/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-25/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-4/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-14/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-22/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-2/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-3/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-7/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-16/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-1/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-17/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-8/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-10/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-18/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-20/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-11/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-15/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-21/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-12/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-13/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam] read_buffer_size=null read_filter=[] intervals=[chrX] excludeIntervals=[chrM, chrY] reference_sequence=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta rodBind=[dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod, interval,Intervals,chrX] rodToIntervalTrackName=null BTI_merge_rule=UNION DBSNP=/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod hapmap=null hapmap_chip=null out=null err=null outerr=null filterZeroMappingQualityReads=false downsampling_type=NONE downsample_to_fraction=null downsample_to_coverage=null useOriginalQualities=false validation_strictness=SILENT unsafe=null max_reads_at_locus=10000 num_threads=1 interval_merging=ALL read_group_black_list=null genotype_model=JOINT_ESTIMATE base_model=EMPIRICAL heterozygosity=7.8E-4 genotype=false output_all_callable_bases=false standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=10.0 trigger_min_confidence_threshold_for_calling=30.0 trigger_min_confidence_threshold_for_emitting=30.0 noSLOD=false assume_single_sample_reads=null platform=null min_base_quality_score=20 min_mapping_quality_score=20 max_mismatches_in_40bp_window=3 use_reads_with_bad_mates=false max_deletion_fraction=0.05 cap_base_quality_by_mapping_quality=false" +##VariantFiltration="analysis_type=VariantFiltration input_file=[] read_buffer_size=null read_filter=[] intervals=null excludeIntervals=[chrM, chrY] reference_sequence=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta rodBind=[variant,VCF,wgs.v9/HiSeq.WGS.cleaned.ug.snpfiltered.vcf, mask,Bed,wgs.v9/HiSeq.WGS.cleaned.indels.10.mask] rodToIntervalTrackName=null BTI_merge_rule=UNION DBSNP=null hapmap=null hapmap_chip=null out=wgs.v9/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf err=null outerr=null filterZeroMappingQualityReads=false downsampling_type=NONE downsample_to_fraction=null downsample_to_coverage=null useOriginalQualities=false validation_strictness=SILENT unsafe=null max_reads_at_locus=2147483647 num_threads=1 interval_merging=ALL read_group_black_list=null filterExpression=[] filterName=[] genotypeFilterExpression=[] genotypeFilterName=[] clusterSize=3 clusterWindowSize=0 maskName=Indel NO_HEADER=false" +##VariantFiltration="analysis_type=VariantFiltration input_file=[] read_buffer_size=null read_filter=[] intervals=null excludeIntervals=[chrM, chrY] reference_sequence=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta rodBind=[variant,VCF,wgs.v9/HiSeq.WGS.cleaned.ug.vcf] rodToIntervalTrackName=null BTI_merge_rule=UNION DBSNP=null hapmap=null hapmap_chip=null out=wgs.v9/HiSeq.WGS.cleaned.ug.snpfiltered.vcf err=null outerr=null filterZeroMappingQualityReads=false downsampling_type=NONE downsample_to_fraction=null downsample_to_coverage=null useOriginalQualities=false validation_strictness=SILENT unsafe=null max_reads_at_locus=2147483647 num_threads=1 interval_merging=ALL read_group_black_list=null filterExpression=[QUAL < 50.0, MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1), AB > 0.75 && DP > 40, DP > 120 || SB > -0.10] filterName=[LowQual, HARD_TO_VALIDATE, ABFilter, DPFilter] genotypeFilterExpression=[] genotypeFilterName=[] clusterSize=3 clusterWindowSize=10 maskName=Mask NO_HEADER=false" +##source=VariantOptimizer +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +chr1 109 . A T 0 FDRtranche2.00to10.00+ AC=1;AF=0.50;AN=2;DP=1019;Dels=0.00;HRun=0;HaplotypeScore=686.65;MQ=19.20;MQ0=288;OQ=2175.54;QD=2.13;SB=-1042.18 GT:AD:DP:GL:GQ 0/1:610,327:308:-316.30,-95.47,-803.03:99