Skip to content

Commit

Permalink
Fix performance issue when creating a VCF header with a large number …
Browse files Browse the repository at this point in the history
…of contig sequences. (#968)
  • Loading branch information
cmnbroad authored Aug 16, 2017
1 parent 2b5736f commit d5af563
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 16 deletions.
26 changes: 11 additions & 15 deletions src/main/java/htsjdk/variant/vcf/VCFHeader.java
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ public enum HEADER_FIELDS {
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new LinkedHashMap<String, VCFFormatHeaderLine>();
private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new LinkedHashMap<String, VCFFilterHeaderLine>();
private final Map<String, VCFHeaderLine> mOtherMetaData = new LinkedHashMap<String, VCFHeaderLine>();
private final List<VCFContigHeaderLine> contigMetaData = new ArrayList<VCFContigHeaderLine>();
private final Map<String, VCFContigHeaderLine> contigMetaData = new LinkedHashMap<>();

// the list of auxillary tags
private final List<String> mGenotypeSampleNames = new ArrayList<String>();
Expand Down Expand Up @@ -188,7 +188,8 @@ public void addMetaDataLine(final VCFHeaderLine headerLine) {
* @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present
*/
public List<VCFContigHeaderLine> getContigLines() {
return Collections.unmodifiableList(contigMetaData);
// this must preserve input order
return Collections.unmodifiableList(new ArrayList<>(contigMetaData.values()));
}

/**
Expand Down Expand Up @@ -223,10 +224,8 @@ public void setSequenceDictionary(final SAMSequenceDictionary dictionary) {
}
mMetaData.removeAll(toRemove);
for (final SAMSequenceRecord record : dictionary.getSequences()) {
contigMetaData.add(new VCFContigHeaderLine(record, record.getAssembly()));
addMetaDataLine(new VCFContigHeaderLine(record, record.getAssembly()));
}

this.mMetaData.addAll(contigMetaData);
}

public VariantContextComparator getVCFRecordComparator() {
Expand Down Expand Up @@ -321,18 +320,15 @@ private boolean addMetadataLineLookupEntry(final VCFHeaderLine line) {
* @return true if line was added to the list of contig lines, otherwise false
*/
private boolean addContigMetaDataLineLookupEntry(final VCFContigHeaderLine line) {
for (VCFContigHeaderLine vcfContigHeaderLine : contigMetaData) {
// if we are trying to add a contig for the same ID
if (vcfContigHeaderLine.getID().equals(line.getID())) {
if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
System.err.println("Found duplicate VCF contig header lines for " + line.getID() + "; keeping the first only" );
}
// do not add this contig if it exists
return false;
// if we are trying to add a contig for the same ID
if (contigMetaData.containsKey(line.getID())) {
if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
System.err.println("Found duplicate VCF contig header lines for " + line.getID() + "; keeping the first only" );
}
// do not add this contig if it exists
return false;
}

contigMetaData.add(line);
contigMetaData.put(line.getID(), line);
return true;
}

Expand Down
33 changes: 32 additions & 1 deletion src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,8 @@ public void testVCFHeaderAddFilterLine() {
@Test
public void testVCFHeaderAddContigLine() {
final VCFHeader header = getHiSeqVCFHeader();
final VCFContigHeaderLine contigLine = new VCFContigHeaderLine("<ID=chr1,length=1234567890,assembly=FAKE,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\">", VCFHeaderVersion.VCF4_0, "chr1", 0);
final VCFContigHeaderLine contigLine = new VCFContigHeaderLine(
"<ID=chr1,length=1234567890,assembly=FAKE,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\">", VCFHeaderVersion.VCF4_0, VCFHeader.CONTIG_KEY, 0);
header.addMetaDataLine(contigLine);

Assert.assertTrue(header.getContigLines().contains(contigLine), "Test contig line not found in contig header lines");
Expand All @@ -235,6 +236,36 @@ public void testVCFHeaderAddContigLine() {
Assert.assertFalse(header.getOtherHeaderLines().contains(contigLine), "Test contig line present in other header lines");
}

@Test
public void testVCFHeaderHonorContigLineOrder() throws IOException {
try (final VCFFileReader vcfReader = new VCFFileReader(new File(variantTestDataRoot + "dbsnp_135.b37.1000.vcf"), false)) {
// start with a header with a bunch of contig lines
final VCFHeader header = vcfReader.getFileHeader();
final List<VCFContigHeaderLine> originalHeaderList = header.getContigLines();
Assert.assertTrue(originalHeaderList.size() > 0);

// copy the contig lines to a new list, sticking an extra contig line in the middle
final List<VCFContigHeaderLine> orderedList = new ArrayList<>();
final int splitInTheMiddle = originalHeaderList.size() / 2;
orderedList.addAll(originalHeaderList.subList(0, splitInTheMiddle));
final VCFContigHeaderLine outrageousContigLine = new VCFContigHeaderLine(
"<ID=outrageousID,length=1234567890,assembly=FAKE,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\">",
VCFHeaderVersion.VCF4_2,
VCFHeader.CONTIG_KEY,
0);
orderedList.add(outrageousContigLine);
// make sure the extra contig line is outrageous enough to not collide with a real contig ID
Assert.assertTrue(orderedList.contains(outrageousContigLine));
orderedList.addAll(originalHeaderList.subList(splitInTheMiddle, originalHeaderList.size()));
Assert.assertEquals(originalHeaderList.size() + 1, orderedList.size());

// crete a new header from the ordered list, and test that getContigLines honors the input order
final VCFHeader orderedHeader = new VCFHeader();
orderedList.forEach(hl -> orderedHeader.addMetaDataLine(hl));
Assert.assertEquals(orderedList, orderedHeader.getContigLines());
}
}

@Test
public void testVCFHeaderAddOtherLine() {
final VCFHeader header = getHiSeqVCFHeader();
Expand Down

0 comments on commit d5af563

Please sign in to comment.