From bc70a087c09ac152945bff7abfb3bc9e38aa433e Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 8 Apr 2019 11:10:31 +0100 Subject: [PATCH 1/2] Support writing a CRAI index from CRAMContainerStreamWriter --- .../htsjdk/samtools/AbstractCRAMIndexer.java | 20 ++++++ .../java/htsjdk/samtools/CRAMBAIIndexer.java | 2 +- .../java/htsjdk/samtools/CRAMCRAIIndexer.java | 6 +- .../samtools/CRAMContainerStreamWriter.java | 27 ++++++-- .../CRAMContainerStreamWriterTest.java | 65 ++++++++++++++++++- 5 files changed, 111 insertions(+), 9 deletions(-) create mode 100644 src/main/java/htsjdk/samtools/AbstractCRAMIndexer.java diff --git a/src/main/java/htsjdk/samtools/AbstractCRAMIndexer.java b/src/main/java/htsjdk/samtools/AbstractCRAMIndexer.java new file mode 100644 index 0000000000..16b9e0920c --- /dev/null +++ b/src/main/java/htsjdk/samtools/AbstractCRAMIndexer.java @@ -0,0 +1,20 @@ +package htsjdk.samtools; + +import htsjdk.samtools.cram.structure.Container; + +/** + * Base class for indexing CRAM. + */ +public abstract class AbstractCRAMIndexer { + /** + * Create index entries for a single container. + * @param container the container to index + * @param validationStringency stringency for validating records, passed to {@link Container#getSpans(ValidationStringency)} + */ + abstract void processContainer(final Container container, final ValidationStringency validationStringency); + + /** + * Finish creating the index by writing the accumulated entries out. + */ + public abstract void finish(); +} diff --git a/src/main/java/htsjdk/samtools/CRAMBAIIndexer.java b/src/main/java/htsjdk/samtools/CRAMBAIIndexer.java index cd0cf3fe69..8bc1da07c7 100755 --- a/src/main/java/htsjdk/samtools/CRAMBAIIndexer.java +++ b/src/main/java/htsjdk/samtools/CRAMBAIIndexer.java @@ -79,7 +79,7 @@ * but it is unused. This would be accomplished via {@link #createIndex(SeekableStream, File, Log, ValidationStringency)}. * */ -public class CRAMBAIIndexer { +public class CRAMBAIIndexer extends AbstractCRAMIndexer { // The number of references (chromosomes) in the BAM file private final int numReferences; diff --git a/src/main/java/htsjdk/samtools/CRAMCRAIIndexer.java b/src/main/java/htsjdk/samtools/CRAMCRAIIndexer.java index 0ac24929ee..bfab545d46 100644 --- a/src/main/java/htsjdk/samtools/CRAMCRAIIndexer.java +++ b/src/main/java/htsjdk/samtools/CRAMCRAIIndexer.java @@ -26,7 +26,7 @@ *
  • read an existing index from an input stream
  • *

    */ -public class CRAMCRAIIndexer { +public class CRAMCRAIIndexer extends AbstractCRAMIndexer { final private CRAIIndex craiIndex = new CRAIIndex(); final private GZIPOutputStream os; @@ -72,6 +72,10 @@ public void processContainer(final Container container) { craiIndex.processContainer(container); } + void processContainer(final Container container, final ValidationStringency validationStringency) { + processContainer(container); + } + /** * Finish creating the index by writing the accumulated entries out to the stream. */ diff --git a/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java b/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java index b87744e45d..5b667dc89f 100644 --- a/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java +++ b/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java @@ -59,7 +59,7 @@ public class CRAMContainerStreamWriter { private Set captureTags = new TreeSet<>(); private Set ignoreTags = new TreeSet<>(); - private CRAMBAIIndexer indexer; + private AbstractCRAMIndexer indexer; private long offset; /** @@ -78,14 +78,31 @@ public CRAMContainerStreamWriter( final CRAMReferenceSource source, final SAMFileHeader samFileHeader, final String cramId) { + this(outputStream, source, samFileHeader, cramId, indexStream == null ? null : new CRAMBAIIndexer(indexStream, samFileHeader)); + } + + /** + * Create a CRAMContainerStreamWriter for writing SAM records into a series of CRAM + * containers on output stream, with an optional index. + * + * @param outputStream where to write the CRAM stream. + * @param source reference source + * @param samFileHeader {@link SAMFileHeader} to be used. Sort order is determined by the sortOrder property of this arg. + * @param cramId used for display in error message display + * @param indexer CRAM indexer. Can be null if no index is required. + */ + public CRAMContainerStreamWriter( + final OutputStream outputStream, + final CRAMReferenceSource source, + final SAMFileHeader samFileHeader, + final String cramId, + final AbstractCRAMIndexer indexer) { this.outputStream = outputStream; + this.source = source; this.samFileHeader = samFileHeader; this.cramID = cramId; - this.source = source; + this.indexer = indexer; containerFactory = new ContainerFactory(samFileHeader, recordsPerSlice); - if (indexStream != null) { - indexer = new CRAMBAIIndexer(indexStream, samFileHeader); - } } /** diff --git a/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java b/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java index 9ab9ed2784..5479300abc 100644 --- a/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java +++ b/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java @@ -91,6 +91,28 @@ private void doTest(final List samRecords, final ByteArrayOutputStrea Assert.assertEquals(count, samRecords.size()); } + private void doTest(final List samRecords, final ByteArrayOutputStream outStream, final SAMFileHeader header, final AbstractCRAMIndexer indexer) { + final ReferenceSource refSource = createReferenceSource(); + + final CRAMContainerStreamWriter containerStream = new CRAMContainerStreamWriter(outStream, refSource, header, "test", indexer); + containerStream.writeHeader(header); + + for (SAMRecord record : samRecords) { + containerStream.writeAlignment(record); + } + containerStream.finish(true); // finish and issue EOF + + // read all the records back in + final CRAMFileReader cReader = new CRAMFileReader(null, new ByteArrayInputStream(outStream.toByteArray()), refSource); + final SAMRecordIterator iterator = cReader.getIterator(); + int count = 0; + while (iterator.hasNext()) { + SAMRecord actualRecord = iterator.next(); + count++; + } + Assert.assertEquals(count, samRecords.size()); + } + @Test(description = "Test CRAMContainerStream no index") public void testCRAMContainerStreamNoIndex() { final List samRecords = createRecords(100); @@ -144,8 +166,8 @@ public void testCRAMContainerAggregatePartitions() throws IOException { Assert.assertEquals(count, nRecs); } - @Test(description = "Test CRAMContainerStream with index") - public void testCRAMContainerStreamWithIndex() throws IOException { + @Test(description = "Test CRAMContainerStream with bai index") + public void testCRAMContainerStreamWithBaiIndex() throws IOException { final List samRecords = createRecords(100); final ByteArrayOutputStream outStream = new ByteArrayOutputStream(); final ByteArrayOutputStream indexStream = new ByteArrayOutputStream(); @@ -182,4 +204,43 @@ public void testCRAMContainerStreamWithIndex() throws IOException { Assert.assertEquals(count, 2); } + @Test(description = "Test CRAMContainerStream with crai index") + public void testCRAMContainerStreamWithCraiIndex() throws IOException { + final List samRecords = createRecords(100); + final SAMFileHeader header = createSAMHeader(SAMFileHeader.SortOrder.coordinate); + final ByteArrayOutputStream outStream = new ByteArrayOutputStream(); + final ByteArrayOutputStream indexStream = new ByteArrayOutputStream(); + doTest(samRecords, outStream, header, new CRAMCRAIIndexer(indexStream, header)); + outStream.close(); + indexStream.close(); + + // write the file out + final File cramTempFile = File.createTempFile("cramContainerStreamTest", ".cram"); + cramTempFile.deleteOnExit(); + final OutputStream cramFileStream = new FileOutputStream(cramTempFile); + cramFileStream.write(outStream.toByteArray()); + cramFileStream.close(); + + // write the index out + final File indexTempFile = File.createTempFile("cramContainerStreamTest", ".crai"); + indexTempFile.deleteOnExit(); + OutputStream indexFileStream = new FileOutputStream(indexTempFile); + indexFileStream.write(indexStream.toByteArray()); + indexFileStream.close(); + + final ReferenceSource refSource = createReferenceSource(); + final CRAMFileReader reader = new CRAMFileReader( + cramTempFile, + indexTempFile, + refSource, + ValidationStringency.SILENT); + final CloseableIterator iterator = reader.query(new QueryInterval[]{new QueryInterval(1, 10, 10)}, false); + int count = 0; + while (iterator.hasNext()) { + SAMRecord actualRecord = iterator.next(); + count++; + } + Assert.assertEquals(count, 2); + } + } From d9a9bc7dfed4936dc76f2b222cd3feeeb2aea047 Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 26 Apr 2019 14:47:06 +0100 Subject: [PATCH 2/2] Respond to feedback --- .../java/htsjdk/samtools/CRAMBAIIndexer.java | 6 +- .../java/htsjdk/samtools/CRAMCRAIIndexer.java | 6 +- .../samtools/CRAMContainerStreamWriter.java | 4 +- ...tractCRAMIndexer.java => CRAMIndexer.java} | 8 +- .../CRAMContainerStreamWriterTest.java | 78 ++++++------------- 5 files changed, 36 insertions(+), 66 deletions(-) rename src/main/java/htsjdk/samtools/{AbstractCRAMIndexer.java => CRAMIndexer.java} (65%) diff --git a/src/main/java/htsjdk/samtools/CRAMBAIIndexer.java b/src/main/java/htsjdk/samtools/CRAMBAIIndexer.java index 8bc1da07c7..4448f81adc 100755 --- a/src/main/java/htsjdk/samtools/CRAMBAIIndexer.java +++ b/src/main/java/htsjdk/samtools/CRAMBAIIndexer.java @@ -79,7 +79,7 @@ * but it is unused. This would be accomplished via {@link #createIndex(SeekableStream, File, Log, ValidationStringency)}. * */ -public class CRAMBAIIndexer extends AbstractCRAMIndexer { +public class CRAMBAIIndexer implements CRAMIndexer { // The number of references (chromosomes) in the BAM file private final int numReferences; @@ -130,7 +130,8 @@ public CRAMBAIIndexer(final OutputStream output, final SAMFileHeader fileHeader) * * @param container container to be indexed */ - void processContainer(final Container container, final ValidationStringency validationStringency) { + @Override + public void processContainer(final Container container, final ValidationStringency validationStringency) { if (container == null || container.isEOF()) { return; } @@ -226,6 +227,7 @@ public void processAsSingleReferenceSlice(final Slice slice) { * After all the slices have been processed, finish is called. * Writes any final information and closes the output file. */ + @Override public void finish() { // process any remaining references advanceToReference(numReferences); diff --git a/src/main/java/htsjdk/samtools/CRAMCRAIIndexer.java b/src/main/java/htsjdk/samtools/CRAMCRAIIndexer.java index bfab545d46..a1f8cd7486 100644 --- a/src/main/java/htsjdk/samtools/CRAMCRAIIndexer.java +++ b/src/main/java/htsjdk/samtools/CRAMCRAIIndexer.java @@ -26,7 +26,7 @@ *

  • read an existing index from an input stream
  • *

    */ -public class CRAMCRAIIndexer extends AbstractCRAMIndexer { +public class CRAMCRAIIndexer implements CRAMIndexer { final private CRAIIndex craiIndex = new CRAIIndex(); final private GZIPOutputStream os; @@ -72,13 +72,15 @@ public void processContainer(final Container container) { craiIndex.processContainer(container); } - void processContainer(final Container container, final ValidationStringency validationStringency) { + @Override + public void processContainer(final Container container, final ValidationStringency validationStringency) { processContainer(container); } /** * Finish creating the index by writing the accumulated entries out to the stream. */ + @Override public void finish() { try { craiIndex.writeIndex(os); diff --git a/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java b/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java index 5b667dc89f..509695ab4d 100644 --- a/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java +++ b/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java @@ -59,7 +59,7 @@ public class CRAMContainerStreamWriter { private Set captureTags = new TreeSet<>(); private Set ignoreTags = new TreeSet<>(); - private AbstractCRAMIndexer indexer; + private CRAMIndexer indexer; private long offset; /** @@ -96,7 +96,7 @@ public CRAMContainerStreamWriter( final CRAMReferenceSource source, final SAMFileHeader samFileHeader, final String cramId, - final AbstractCRAMIndexer indexer) { + final CRAMIndexer indexer) { this.outputStream = outputStream; this.source = source; this.samFileHeader = samFileHeader; diff --git a/src/main/java/htsjdk/samtools/AbstractCRAMIndexer.java b/src/main/java/htsjdk/samtools/CRAMIndexer.java similarity index 65% rename from src/main/java/htsjdk/samtools/AbstractCRAMIndexer.java rename to src/main/java/htsjdk/samtools/CRAMIndexer.java index 16b9e0920c..d59e8a1c1e 100644 --- a/src/main/java/htsjdk/samtools/AbstractCRAMIndexer.java +++ b/src/main/java/htsjdk/samtools/CRAMIndexer.java @@ -3,18 +3,18 @@ import htsjdk.samtools.cram.structure.Container; /** - * Base class for indexing CRAM. + * Interface for indexing CRAM. */ -public abstract class AbstractCRAMIndexer { +public interface CRAMIndexer { /** * Create index entries for a single container. * @param container the container to index * @param validationStringency stringency for validating records, passed to {@link Container#getSpans(ValidationStringency)} */ - abstract void processContainer(final Container container, final ValidationStringency validationStringency); + void processContainer(final Container container, final ValidationStringency validationStringency); /** * Finish creating the index by writing the accumulated entries out. */ - public abstract void finish(); + void finish(); } diff --git a/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java b/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java index 5479300abc..3b7d1b7bbd 100644 --- a/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java +++ b/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java @@ -3,11 +3,9 @@ import htsjdk.HtsjdkTest; import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.reference.InMemoryReferenceSequenceFile; -import htsjdk.samtools.seekablestream.SeekableMemoryStream; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.Log.LogLevel; -import htsjdk.samtools.util.RuntimeIOException; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -18,7 +16,6 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; -import java.io.StringWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -75,28 +72,19 @@ private void doTest(final List samRecords, final ByteArrayOutputStrea final CRAMContainerStreamWriter containerStream = new CRAMContainerStreamWriter(outStream, indexStream, refSource, header, "test"); containerStream.writeHeader(header); - for (SAMRecord record : samRecords) { - containerStream.writeAlignment(record); - } - containerStream.finish(true); // finish and issue EOF - - // read all the records back in - final CRAMFileReader cReader = new CRAMFileReader(null, new ByteArrayInputStream(outStream.toByteArray()), refSource); - final SAMRecordIterator iterator = cReader.getIterator(); - int count = 0; - while (iterator.hasNext()) { - SAMRecord actualRecord = iterator.next(); - count++; - } - Assert.assertEquals(count, samRecords.size()); + writeThenReadRecords(samRecords, outStream, refSource, containerStream); } - private void doTest(final List samRecords, final ByteArrayOutputStream outStream, final SAMFileHeader header, final AbstractCRAMIndexer indexer) { + private void doTestWithIndexer(final List samRecords, final ByteArrayOutputStream outStream, final SAMFileHeader header, final CRAMIndexer indexer) { final ReferenceSource refSource = createReferenceSource(); final CRAMContainerStreamWriter containerStream = new CRAMContainerStreamWriter(outStream, refSource, header, "test", indexer); containerStream.writeHeader(header); + writeThenReadRecords(samRecords, outStream, refSource, containerStream); + } + + private void writeThenReadRecords(List samRecords, ByteArrayOutputStream outStream, ReferenceSource refSource, CRAMContainerStreamWriter containerStream) { for (SAMRecord record : samRecords) { containerStream.writeAlignment(record); } @@ -169,51 +157,29 @@ public void testCRAMContainerAggregatePartitions() throws IOException { @Test(description = "Test CRAMContainerStream with bai index") public void testCRAMContainerStreamWithBaiIndex() throws IOException { final List samRecords = createRecords(100); - final ByteArrayOutputStream outStream = new ByteArrayOutputStream(); - final ByteArrayOutputStream indexStream = new ByteArrayOutputStream(); - doTest(samRecords, outStream, indexStream); - outStream.close(); - indexStream.close(); - - // write the file out - final File cramTempFile = File.createTempFile("cramContainerStreamTest", ".cram"); - cramTempFile.deleteOnExit(); - final OutputStream cramFileStream = new FileOutputStream(cramTempFile); - cramFileStream.write(outStream.toByteArray()); - cramFileStream.close(); - - // write the index out - final File indexTempFile = File.createTempFile("cramContainerStreamTest", ".bai"); - indexTempFile.deleteOnExit(); - OutputStream indexFileStream = new FileOutputStream(indexTempFile); - indexFileStream.write(indexStream.toByteArray()); - indexFileStream.close(); - - final ReferenceSource refSource = createReferenceSource(); - final CRAMFileReader reader = new CRAMFileReader( - cramTempFile, - indexTempFile, - refSource, - ValidationStringency.SILENT); - final CloseableIterator iterator = reader.query(new QueryInterval[]{new QueryInterval(1, 10, 10)}, false); - int count = 0; - while (iterator.hasNext()) { - SAMRecord actualRecord = iterator.next(); - count++; + try (ByteArrayOutputStream outStream = new ByteArrayOutputStream(); + ByteArrayOutputStream indexStream = new ByteArrayOutputStream()) { + doTest(samRecords, outStream, indexStream); + outStream.flush(); + indexStream.flush(); + checkCRAMContainerStream(outStream, indexStream, ".bai"); } - Assert.assertEquals(count, 2); } @Test(description = "Test CRAMContainerStream with crai index") public void testCRAMContainerStreamWithCraiIndex() throws IOException { final List samRecords = createRecords(100); final SAMFileHeader header = createSAMHeader(SAMFileHeader.SortOrder.coordinate); - final ByteArrayOutputStream outStream = new ByteArrayOutputStream(); - final ByteArrayOutputStream indexStream = new ByteArrayOutputStream(); - doTest(samRecords, outStream, header, new CRAMCRAIIndexer(indexStream, header)); - outStream.close(); - indexStream.close(); + try (ByteArrayOutputStream outStream = new ByteArrayOutputStream(); + ByteArrayOutputStream indexStream = new ByteArrayOutputStream()) { + doTestWithIndexer(samRecords, outStream, header, new CRAMCRAIIndexer(indexStream, header)); + outStream.flush(); + indexStream.flush(); + checkCRAMContainerStream(outStream, indexStream, ".crai"); + } + } + private void checkCRAMContainerStream(ByteArrayOutputStream outStream, ByteArrayOutputStream indexStream, String indexExtension) throws IOException { // write the file out final File cramTempFile = File.createTempFile("cramContainerStreamTest", ".cram"); cramTempFile.deleteOnExit(); @@ -222,7 +188,7 @@ public void testCRAMContainerStreamWithCraiIndex() throws IOException { cramFileStream.close(); // write the index out - final File indexTempFile = File.createTempFile("cramContainerStreamTest", ".crai"); + final File indexTempFile = File.createTempFile("cramContainerStreamTest", indexExtension); indexTempFile.deleteOnExit(); OutputStream indexFileStream = new FileOutputStream(indexTempFile); indexFileStream.write(indexStream.toByteArray());