From 1f0249ddf0e227eeebefc5723c940bd58e32b3af Mon Sep 17 00:00:00 2001 From: Chris Whelan Date: Thu, 19 Sep 2019 11:07:43 -0400 Subject: [PATCH 1/3] Make VCFHeader not throw exception if contig header lines lack length field --- .../java/htsjdk/variant/vcf/VCFContigHeaderLine.java | 7 ++++++- src/main/java/htsjdk/variant/vcf/VCFHeader.java | 6 ++++-- .../java/htsjdk/variant/vcf/VCFHeaderUnitTest.java | 12 ++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java index 2cbc40ea28..730cb32dde 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java @@ -76,9 +76,14 @@ public Integer getContigIndex() { return contigIndex; } + /** + * Get the SAMSequenceRecord that corresponds to this VCF header line. + * @return The SAMSequenceRecord containing the ID, length, assembly, and index of this contig. Returns null if the + * contig header line does not have a length. + */ public SAMSequenceRecord getSAMSequenceRecord() { final String lengthString = this.getGenericFieldValue("length"); - if (lengthString == null) throw new TribbleException("Contig " + this.getID() + " does not have a length field."); + if (lengthString == null) return null; final SAMSequenceRecord record = new SAMSequenceRecord(this.getID(), Integer.parseInt(lengthString)); record.setAssembly(this.getGenericFieldValue("assembly")); record.setSequenceIndex(this.contigIndex); diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeader.java b/src/main/java/htsjdk/variant/vcf/VCFHeader.java index c023e5bbf2..266b23a5c5 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeader.java @@ -261,7 +261,7 @@ public List getContigLines() { /** * Returns the contigs in this VCF file as a SAMSequenceDictionary. Returns null if contigs lines are - * not present in the header. Throws SAMException if one or more contig lines do not have length + * not present in the header. Returns null if one or more contig lines do not have length * information. */ public SAMSequenceDictionary getSequenceDictionary() { @@ -270,7 +270,9 @@ public SAMSequenceDictionary getSequenceDictionary() { final List sequenceRecords = new ArrayList(contigHeaderLines.size()); for (final VCFContigHeaderLine contigHeaderLine : contigHeaderLines) { - sequenceRecords.add(contigHeaderLine.getSAMSequenceRecord()); + final SAMSequenceRecord samSequenceRecord = contigHeaderLine.getSAMSequenceRecord(); + if (samSequenceRecord == null) return null; + sequenceRecords.add(samSequenceRecord); } return new SAMSequenceDictionary(sequenceRecords); diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java index 5bb42a3205..5b96f6c23f 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java @@ -239,6 +239,18 @@ public void testVCFHeaderAddContigLine() { } @Test + public void testVCFHeaderContigLineMissingLength() { + final VCFHeader header = getHiSeqVCFHeader(); + final VCFContigHeaderLine contigLine = new VCFContigHeaderLine( + "", VCFHeaderVersion.VCF4_0, VCFHeader.CONTIG_KEY, 0); + header.addMetaDataLine(contigLine); + Assert.assertTrue(header.getContigLines().contains(contigLine), "Test contig line not found in contig header lines"); + Assert.assertTrue(header.getMetaDataInInputOrder().contains(contigLine), "Test contig line not found in set of all header lines"); + + Assert.assertNull(header.getSequenceDictionary()); + } + + @Test public void testVCFHeaderHonorContigLineOrder() throws IOException { try (final VCFFileReader vcfReader = new VCFFileReader(new File(variantTestDataRoot + "dbsnp_135.b37.1000.vcf"), false)) { // start with a header with a bunch of contig lines From c8150b79f8f52bad4c0416728f566306f74fb3b5 Mon Sep 17 00:00:00 2001 From: Chris Whelan Date: Mon, 30 Sep 2019 21:33:41 -0400 Subject: [PATCH 2/3] Per PR comments, change to returning a header line with length set to SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH --- .../java/htsjdk/variant/vcf/VCFContigHeaderLine.java | 12 ++++++++++-- .../java/htsjdk/variant/vcf/VCFHeaderUnitTest.java | 7 ++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java index 730cb32dde..9ec50681b4 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java @@ -78,13 +78,21 @@ public Integer getContigIndex() { /** * Get the SAMSequenceRecord that corresponds to this VCF header line. + * If the VCF header line does not have a length tag, the SAMSequenceRecord returned will be set to have a length of + * SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH. Records with unknown length will match any record with the same name + * when evaluated by SAMSequenceRecord.isSameSequence. * @return The SAMSequenceRecord containing the ID, length, assembly, and index of this contig. Returns null if the * contig header line does not have a length. */ public SAMSequenceRecord getSAMSequenceRecord() { final String lengthString = this.getGenericFieldValue("length"); - if (lengthString == null) return null; - final SAMSequenceRecord record = new SAMSequenceRecord(this.getID(), Integer.parseInt(lengthString)); + final int length; + if (lengthString == null) { + length = SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH; + } else { + length = Integer.parseInt(lengthString); + } + final SAMSequenceRecord record = new SAMSequenceRecord(this.getID(), length); record.setAssembly(this.getGenericFieldValue("assembly")); record.setSequenceIndex(this.contigIndex); return record; diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java index 5b96f6c23f..e4d5099eda 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java @@ -25,6 +25,8 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.FileExtensions; import htsjdk.samtools.util.IOUtil; @@ -247,7 +249,10 @@ public void testVCFHeaderContigLineMissingLength() { Assert.assertTrue(header.getContigLines().contains(contigLine), "Test contig line not found in contig header lines"); Assert.assertTrue(header.getMetaDataInInputOrder().contains(contigLine), "Test contig line not found in set of all header lines"); - Assert.assertNull(header.getSequenceDictionary()); + final SAMSequenceDictionary sequenceDictionary = header.getSequenceDictionary(); + Assert.assertNotNull(sequenceDictionary); + Assert.assertEquals(sequenceDictionary.getSequence("chr1").getSequenceLength(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); + } @Test From ef6f24462f7e806d14f7af8b42860ff0216d3ad1 Mon Sep 17 00:00:00 2001 From: Chris Whelan Date: Mon, 7 Oct 2019 09:58:00 -0400 Subject: [PATCH 3/3] Update comment and assertion to reflect the fact that ContigHeaderLine.getSAMSequenceRecord will no longer return null --- src/main/java/htsjdk/variant/vcf/VCFHeader.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeader.java b/src/main/java/htsjdk/variant/vcf/VCFHeader.java index 266b23a5c5..be7469b74b 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeader.java @@ -261,8 +261,9 @@ public List getContigLines() { /** * Returns the contigs in this VCF file as a SAMSequenceDictionary. Returns null if contigs lines are - * not present in the header. Returns null if one or more contig lines do not have length - * information. + * not present in the header. If contig lines are missing length tags, they will be created with + * length set to SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH. Records with unknown length will match any record with + * the same name when evaluated by SAMSequenceRecord.isSameSequence. */ public SAMSequenceDictionary getSequenceDictionary() { final List contigHeaderLines = this.getContigLines(); @@ -271,7 +272,6 @@ public SAMSequenceDictionary getSequenceDictionary() { final List sequenceRecords = new ArrayList(contigHeaderLines.size()); for (final VCFContigHeaderLine contigHeaderLine : contigHeaderLines) { final SAMSequenceRecord samSequenceRecord = contigHeaderLine.getSAMSequenceRecord(); - if (samSequenceRecord == null) return null; sequenceRecords.add(samSequenceRecord); }