Skip to content

Commit

Permalink
Follow-up to #1276: Remove multiple versions of Slice/Container getCR…
Browse files Browse the repository at this point in the history
…AIEntries() (#1329)

* Add bai and crai index init checks to Slice
* Remove multiple versions of Slice/Container getCRAIEntries()
* move Container.setByteOffset() call inside ContainerIO.readContainer()
* populateSlicesAndIndexingParameters -> distributeIndexingParametersToSlices
- called on Container write now as well
* Change setSlices signature to take a List<Slice>
* rm duplicate test code
  • Loading branch information
jmthibault79 authored Apr 1, 2019
1 parent 9f5d86e commit aa89809
Show file tree
Hide file tree
Showing 28 changed files with 738 additions and 444 deletions.
2 changes: 1 addition & 1 deletion src/main/java/htsjdk/samtools/BAMIndexMetaData.java
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ void recordMetaData(final Slice slice) {
unAlignedRecords += slice.unmappedReadsCount;
}

final long start = slice.offset;
final long start = slice.byteOffsetFromContainer;

if (BlockCompressedFilePointerUtil.compare(start, firstOffset) < 1 || firstOffset == -1) {
this.firstOffset = start;
Expand Down
109 changes: 58 additions & 51 deletions src/main/java/htsjdk/samtools/CRAMBAIIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@
*/
package htsjdk.samtools;

import htsjdk.samtools.cram.build.ContainerParser;
import htsjdk.samtools.cram.CRAIEntry;
import htsjdk.samtools.cram.CRAIIndex;
import htsjdk.samtools.cram.build.CramIO;
import htsjdk.samtools.cram.ref.ReferenceContext;
import htsjdk.samtools.cram.structure.AlignmentSpan;
Expand All @@ -52,7 +53,7 @@
import htsjdk.samtools.util.ProgressLogger;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.List;
Expand All @@ -61,11 +62,22 @@

/**
* Class for both constructing BAM index content and writing it out.
*
* There are two usage patterns:
* 1) Building a bam index from an existing cram file
* 2) Building a bam index while building the cram file
* In both cases, processAlignment is called for each cram slice and
* finish() is called at the end.
* 1) Building a bam index (BAI) while building the CRAM file
* 2) Building a bam index (BAI) from an existing CRAI file
*
* 1) is driven by {@link CRAMContainerStreamWriter} and proceeds by calling {@link CRAMBAIIndexer#processContainer}
* after each {@link Container} is built, and {@link CRAMBAIIndexer#finish()} is called at the end.
*
* 2) is driven by {@link CRAIIndex#openCraiFileAsBaiStream(InputStream, SAMSequenceDictionary)}
* and proceeds by processing {@link CRAIEntry} elements obtained from
* {@link CRAMCRAIIndexer#readIndex(InputStream)}. {@link CRAMBAIIndexer#processAsSingleReferenceSlice(Slice)}
* is called on each {@link CRAIEntry} and {@link CRAMBAIIndexer#finish()} is called at the end.
*
* NOTE: a third pattern of building a BAI from a CRAM file is also supported by this class,
* but it is unused. This would be accomplished via {@link #createIndex(SeekableStream, File, Log, ValidationStringency)}.
*
*/
public class CRAMBAIIndexer {

Expand All @@ -78,7 +90,7 @@ public class CRAMBAIIndexer {
private int currentReference = 0;

// content is built up from the input bam file using this
private final BAMIndexBuilder indexBuilder;
private final CRAMBAIIndexBuilder indexBuilder;

/**
* Create a CRAM indexer that writes BAI to a file.
Expand All @@ -91,7 +103,7 @@ private CRAMBAIIndexer(final File output, final SAMFileHeader fileHeader) {
throw new SAMException("CRAM file must be coordinate-sorted for indexing.");
}
numReferences = fileHeader.getSequenceDictionary().size();
indexBuilder = new BAMIndexBuilder(fileHeader);
indexBuilder = new CRAMBAIIndexBuilder(fileHeader);
outputWriter = new BinaryBAMIndexWriter(numReferences, output);
}

Expand All @@ -106,7 +118,7 @@ public CRAMBAIIndexer(final OutputStream output, final SAMFileHeader fileHeader)
throw new SAMException("CRAM file must be coordinate-sorted for indexing.");
}
numReferences = fileHeader.getSequenceDictionary().size();
indexBuilder = new BAMIndexBuilder(fileHeader);
indexBuilder = new CRAMBAIIndexBuilder(fileHeader);
outputWriter = new BinaryBAMIndexWriter(numReferences, output);
}

Expand All @@ -118,21 +130,19 @@ public CRAMBAIIndexer(final OutputStream output, final SAMFileHeader fileHeader)
*
* @param container container to be indexed
*/
public void processContainer(final Container container, final ValidationStringency validationStringency) {
void processContainer(final Container container, final ValidationStringency validationStringency) {
if (container == null || container.isEOF()) {
return;
}

int sliceIndex = 0;
for (final Slice slice : container.slices) {
slice.containerOffset = container.offset;
for (final Slice slice : container.getSlices()) {
slice.index = sliceIndex++;
if (slice.getReferenceContext().isMultiRef()) {
final Map<ReferenceContext, AlignmentSpan> spanMap = container.getSpans(validationStringency);

// TODO why are we updating the original slice here?

slice.containerOffset = container.offset;
slice.index = sliceIndex++;

/**
Expand All @@ -142,8 +152,8 @@ public void processContainer(final Container container, final ValidationStringen
for (final ReferenceContext refContext : new TreeSet<>(spanMap.keySet())) {
final AlignmentSpan span = spanMap.get(refContext);
final Slice fakeSlice = new Slice(refContext);
fakeSlice.containerOffset = slice.containerOffset;
fakeSlice.offset = slice.offset;
fakeSlice.containerByteOffset = slice.containerByteOffset;
fakeSlice.byteOffsetFromContainer = slice.byteOffsetFromContainer;
fakeSlice.index = slice.index;

fakeSlice.alignmentStart = span.getStart();
Expand All @@ -156,8 +166,8 @@ public void processContainer(final Container container, final ValidationStringen

if (unmappedSpan != null) {
final Slice fakeSlice = new Slice(ReferenceContext.UNMAPPED_UNPLACED_CONTEXT);
fakeSlice.containerOffset = slice.containerOffset;
fakeSlice.offset = slice.offset;
fakeSlice.containerByteOffset = slice.containerByteOffset;
fakeSlice.byteOffsetFromContainer = slice.byteOffsetFromContainer;
fakeSlice.index = slice.index;

fakeSlice.alignmentStart = SAMRecord.NO_ALIGNMENT_START;
Expand All @@ -183,6 +193,9 @@ public void processContainer(final Container container, final ValidationStringen
* @throws htsjdk.samtools.SAMException if slice refers to multiple reference sequences.
*/
public void processAsSingleReferenceSlice(final Slice slice) {
// validate that this Slice is ready to be indexed
slice.baiIndexInitializationCheck();

final ReferenceContext sliceContext = slice.getReferenceContext();
if (sliceContext.isMultiRef()) {
throw new SAMException("Expecting a single reference or unmapped slice.");
Expand Down Expand Up @@ -250,40 +263,31 @@ public static void createIndex(final SeekableStream stream,
}
final CRAMBAIIndexer indexer = new CRAMBAIIndexer(output, cramHeader.getSamFileHeader());

int totalRecords = 0;
Container container = null;
ProgressLogger progressLogger = new ProgressLogger(log, 1, "indexed", "slices");
do {
try {
final long offset = stream.position();
container = ContainerIO.readContainer(cramHeader.getVersion(), stream);
if (container == null || container.isEOF()) {
break;
}
container = ContainerIO.readContainer(cramHeader.getVersion(), stream);
if (container == null || container.isEOF()) {
break;
}

container.offset = offset;

indexer.processContainer(container, validationStringency);

if (null != log) {
String sequenceName;
final ReferenceContext containerContext = container.getReferenceContext();
switch (containerContext.getType()) {
case UNMAPPED_UNPLACED_TYPE:
sequenceName = "?";
break;
case MULTIPLE_REFERENCE_TYPE:
sequenceName = "???";
break;
default:
sequenceName = cramHeader.getSamFileHeader().getSequence(containerContext.getSequenceId()).getSequenceName();
break;
}
progressLogger.record(sequenceName, container.alignmentStart);
indexer.processContainer(container, validationStringency);

if (null != log) {
String sequenceName;
final ReferenceContext containerContext = container.getReferenceContext();
switch (containerContext.getType()) {
case UNMAPPED_UNPLACED_TYPE:
sequenceName = "?";
break;
case MULTIPLE_REFERENCE_TYPE:
sequenceName = "???";
break;
default:
sequenceName = cramHeader.getSamFileHeader().getSequence(containerContext.getSequenceId()).getSequenceName();
break;
}

} catch (final IOException e) {
throw new RuntimeException("Failed to read cram container", e);
progressLogger.record(sequenceName, container.alignmentStart);
}

} while (!container.isEOF());
Expand All @@ -296,8 +300,11 @@ public static void createIndex(final SeekableStream stream,
* One instance is used to construct an entire index.
* processAlignment is called for each alignment until a new reference is encountered, then
* processReference is called when all records for the reference have been processed.
*
* TODO: this was copied from BAMIndexer.BAMIndexBuilder
* so perhaps they should be merged back together
*/
private class BAMIndexBuilder {
private class CRAMBAIIndexBuilder {

private final SAMFileHeader bamHeader;

Expand All @@ -315,7 +322,7 @@ private class BAMIndexBuilder {
/**
* @param header SAMFileHeader used for reference name (in index stats) and for max bin number
*/
private BAMIndexBuilder(final SAMFileHeader header) {
private CRAMBAIIndexBuilder(final SAMFileHeader header) {
this.bamHeader = header;
}

Expand Down Expand Up @@ -353,7 +360,7 @@ private int computeIndexingBin(final Slice slice) {
* Record any index information for a given CRAM slice
*
* Reads these Slice fields:
* sequenceId, alignmentStart, alignmentSpan, containerOffset, index
* sequenceId, alignmentStart, alignmentSpan, containerByteOffset, index
*
* @param slice CRAM slice, single ref only.
*/
Expand Down Expand Up @@ -396,8 +403,8 @@ private void processSingleReferenceSlice(final Slice slice) {

// process chunks

final long chunkStart = (slice.containerOffset << 16) | slice.index;
final long chunkEnd = ((slice.containerOffset << 16) | slice.index) + 1;
final long chunkStart = (slice.containerByteOffset << 16) | slice.index;
final long chunkEnd = ((slice.containerByteOffset << 16) | slice.index) + 1;

final Chunk newChunk = new Chunk(chunkStart, chunkEnd);

Expand Down
31 changes: 11 additions & 20 deletions src/main/java/htsjdk/samtools/CRAMCRAIIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import java.io.OutputStream;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Scanner;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
Expand Down Expand Up @@ -94,27 +93,19 @@ public void finish() {
* @param craiStream stream for output index
*/
public static void writeIndex(final SeekableStream cramStream, OutputStream craiStream) {
try {
final CramHeader cramHeader = CramIO.readCramHeader(cramStream);
final CRAMCRAIIndexer indexer = new CRAMCRAIIndexer(craiStream, cramHeader.getSamFileHeader());
final Version cramVersion = cramHeader.getVersion();

// get the first container and its offset
long offset = cramStream.position();
Container container = ContainerIO.readContainer(cramVersion, cramStream);

while (container != null && !container.isEOF()) {
container.offset = offset;
indexer.processContainer(container);
offset = cramStream.position();
container = ContainerIO.readContainer(cramVersion, cramStream);
}
final CramHeader cramHeader = CramIO.readCramHeader(cramStream);
final CRAMCRAIIndexer indexer = new CRAMCRAIIndexer(craiStream, cramHeader.getSamFileHeader());
final Version cramVersion = cramHeader.getVersion();

indexer.finish();
}
catch (IOException e) {
throw new RuntimeIOException("Error writing CRAI index to output stream");
// get the first container
Container container = ContainerIO.readContainer(cramVersion, cramStream);

while (container != null && !container.isEOF()) {
indexer.processContainer(container);
container = ContainerIO.readContainer(cramVersion, cramStream);
}

indexer.finish();
}

/**
Expand Down
5 changes: 2 additions & 3 deletions src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -442,11 +442,10 @@ protected void flushContainer() throws IllegalArgumentException {
}
}

final Container container = containerFactory.buildContainer(cramRecords);
for (final Slice slice : container.slices) {
final Container container = containerFactory.buildContainer(cramRecords, offset);
for (final Slice slice : container.getSlices()) {
slice.setRefMD5(referenceBases);
}
container.offset = offset;
offset += ContainerIO.writeContainer(cramVersion, container, outputStream);
if (indexer != null) {
/**
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/htsjdk/samtools/CRAMFileReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import htsjdk.samtools.cram.ref.CRAMReferenceSource;
import htsjdk.samtools.cram.ref.ReferenceSource;
import htsjdk.samtools.cram.structure.Container;
import htsjdk.samtools.cram.structure.ContainerIO;
import htsjdk.samtools.cram.structure.ContainerHeaderIO;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.CloseableIterator;
Expand Down Expand Up @@ -400,7 +400,7 @@ public CloseableIterator<SAMRecord> queryUnmapped() {
seekableStream.seek(0);
newIterator = new CRAMIterator(seekableStream, referenceSource, validationStringency);
seekableStream.seek(startOfLastLinearBin >>> 16);
final Container container = ContainerIO.readContainerHeader(newIterator.getCramHeader().getVersion().major, seekableStream);
final Container container = ContainerHeaderIO.readContainerHeader(newIterator.getCramHeader().getVersion().major, seekableStream);
seekableStream.seek(seekableStream.position() + container.containerByteSize);
iterator = newIterator;
boolean atAlignments;
Expand Down
7 changes: 3 additions & 4 deletions src/main/java/htsjdk/samtools/CRAMIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,7 @@ void nextContainer() throws IllegalArgumentException, CRAMException {
}
}

for (int i = 0; i < container.slices.length; i++) {
final Slice slice = container.slices[i];
for (final Slice slice : container.getSlices()) {
final ReferenceContext sliceContext = slice.getReferenceContext();

if (! sliceContext.isMappedSingleRef())
Expand Down Expand Up @@ -212,8 +211,8 @@ void nextContainer() throws IllegalArgumentException, CRAMException {
samRecord.setValidationStringency(validationStringency);

if (mReader != null) {
final long chunkStart = (container.offset << 16) | cramRecord.sliceIndex;
final long chunkEnd = ((container.offset << 16) | cramRecord.sliceIndex) + 1;
final long chunkStart = (container.byteOffset << 16) | cramRecord.sliceIndex;
final long chunkEnd = ((container.byteOffset << 16) | cramRecord.sliceIndex) + 1;
samRecord.setFileSource(new SAMFileSource(mReader, new BAMFileSpan(new Chunk(chunkStart, chunkEnd))));
}

Expand Down
4 changes: 2 additions & 2 deletions src/main/java/htsjdk/samtools/cram/CRAIEntry.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ public class CRAIEntry implements Comparable<CRAIEntry> {
private final int alignmentSpan;

// this Slice's Container's offset in bytes from the beginning of the stream
// equal to Slice.containerOffset and Container.offset
// equal to Slice.containerByteOffset and Container.byteOffset
private final long containerStartByteOffset;
// this Slice's offset in bytes from the beginning of its Container
// equal to Slice.offset and Container.landmarks[Slice.index]
// equal to Slice.byteOffsetFromContainer and Container.landmarks[Slice.index]
private final int sliceByteOffset;
private final int sliceByteSize;

Expand Down
Loading

0 comments on commit aa89809

Please sign in to comment.