-
Notifications
You must be signed in to change notification settings - Fork 242
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Follow-up to #1276: Remove multiple versions of Slice/Container getCRAIEntries() #1329
Changes from all commits
f1b63fe
c031882
d89b341
1e1f348
5732bae
210c74c
6738967
0f3ed2d
13be7ff
e2a23ae
f82ea60
f05a6cb
e9d009f
c334167
18b8262
4d5dff8
9bcc358
325ab2a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,7 +38,8 @@ | |
*/ | ||
package htsjdk.samtools; | ||
|
||
import htsjdk.samtools.cram.build.ContainerParser; | ||
import htsjdk.samtools.cram.CRAIEntry; | ||
import htsjdk.samtools.cram.CRAIIndex; | ||
import htsjdk.samtools.cram.build.CramIO; | ||
import htsjdk.samtools.cram.ref.ReferenceContext; | ||
import htsjdk.samtools.cram.structure.AlignmentSpan; | ||
|
@@ -52,7 +53,7 @@ | |
import htsjdk.samtools.util.ProgressLogger; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.OutputStream; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
|
@@ -61,11 +62,22 @@ | |
|
||
/** | ||
* Class for both constructing BAM index content and writing it out. | ||
* | ||
* There are two usage patterns: | ||
* 1) Building a bam index from an existing cram file | ||
* 2) Building a bam index while building the cram file | ||
* In both cases, processAlignment is called for each cram slice and | ||
* finish() is called at the end. | ||
* 1) Building a bam index (BAI) while building the CRAM file | ||
* 2) Building a bam index (BAI) from an existing CRAI file | ||
* | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a third pattern implemented in this file, which is creating an index from an existing CRAM file (see There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right - that's why I didn't include it. I can add it here with a note that it's unused. |
||
* 1) is driven by {@link CRAMContainerStreamWriter} and proceeds by calling {@link CRAMBAIIndexer#processContainer} | ||
* after each {@link Container} is built, and {@link CRAMBAIIndexer#finish()} is called at the end. | ||
* | ||
* 2) is driven by {@link CRAIIndex#openCraiFileAsBaiStream(InputStream, SAMSequenceDictionary)} | ||
* and proceeds by processing {@link CRAIEntry} elements obtained from | ||
* {@link CRAMCRAIIndexer#readIndex(InputStream)}. {@link CRAMBAIIndexer#processAsSingleReferenceSlice(Slice)} | ||
* is called on each {@link CRAIEntry} and {@link CRAMBAIIndexer#finish()} is called at the end. | ||
* | ||
* NOTE: a third pattern of building a BAI from a CRAM file is also supported by this class, | ||
* but it is unused. This would be accomplished via {@link #createIndex(SeekableStream, File, Log, ValidationStringency)}. | ||
* | ||
*/ | ||
public class CRAMBAIIndexer { | ||
|
||
|
@@ -78,7 +90,7 @@ public class CRAMBAIIndexer { | |
private int currentReference = 0; | ||
|
||
// content is built up from the input bam file using this | ||
private final BAMIndexBuilder indexBuilder; | ||
private final CRAMBAIIndexBuilder indexBuilder; | ||
|
||
/** | ||
* Create a CRAM indexer that writes BAI to a file. | ||
|
@@ -91,7 +103,7 @@ private CRAMBAIIndexer(final File output, final SAMFileHeader fileHeader) { | |
throw new SAMException("CRAM file must be coordinate-sorted for indexing."); | ||
} | ||
numReferences = fileHeader.getSequenceDictionary().size(); | ||
indexBuilder = new BAMIndexBuilder(fileHeader); | ||
indexBuilder = new CRAMBAIIndexBuilder(fileHeader); | ||
outputWriter = new BinaryBAMIndexWriter(numReferences, output); | ||
} | ||
|
||
|
@@ -106,7 +118,7 @@ public CRAMBAIIndexer(final OutputStream output, final SAMFileHeader fileHeader) | |
throw new SAMException("CRAM file must be coordinate-sorted for indexing."); | ||
} | ||
numReferences = fileHeader.getSequenceDictionary().size(); | ||
indexBuilder = new BAMIndexBuilder(fileHeader); | ||
indexBuilder = new CRAMBAIIndexBuilder(fileHeader); | ||
outputWriter = new BinaryBAMIndexWriter(numReferences, output); | ||
} | ||
|
||
|
@@ -118,21 +130,19 @@ public CRAMBAIIndexer(final OutputStream output, final SAMFileHeader fileHeader) | |
* | ||
* @param container container to be indexed | ||
*/ | ||
public void processContainer(final Container container, final ValidationStringency validationStringency) { | ||
void processContainer(final Container container, final ValidationStringency validationStringency) { | ||
if (container == null || container.isEOF()) { | ||
return; | ||
} | ||
|
||
int sliceIndex = 0; | ||
for (final Slice slice : container.slices) { | ||
slice.containerOffset = container.offset; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is now set elsewhere There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
for (final Slice slice : container.getSlices()) { | ||
slice.index = sliceIndex++; | ||
if (slice.getReferenceContext().isMultiRef()) { | ||
final Map<ReferenceContext, AlignmentSpan> spanMap = container.getSpans(validationStringency); | ||
|
||
// TODO why are we updating the original slice here? | ||
|
||
slice.containerOffset = container.offset; | ||
slice.index = sliceIndex++; | ||
|
||
/** | ||
|
@@ -142,8 +152,8 @@ public void processContainer(final Container container, final ValidationStringen | |
for (final ReferenceContext refContext : new TreeSet<>(spanMap.keySet())) { | ||
final AlignmentSpan span = spanMap.get(refContext); | ||
final Slice fakeSlice = new Slice(refContext); | ||
fakeSlice.containerOffset = slice.containerOffset; | ||
fakeSlice.offset = slice.offset; | ||
fakeSlice.containerByteOffset = slice.containerByteOffset; | ||
fakeSlice.byteOffsetFromContainer = slice.byteOffsetFromContainer; | ||
fakeSlice.index = slice.index; | ||
|
||
fakeSlice.alignmentStart = span.getStart(); | ||
|
@@ -156,8 +166,8 @@ public void processContainer(final Container container, final ValidationStringen | |
|
||
if (unmappedSpan != null) { | ||
final Slice fakeSlice = new Slice(ReferenceContext.UNMAPPED_UNPLACED_CONTEXT); | ||
fakeSlice.containerOffset = slice.containerOffset; | ||
fakeSlice.offset = slice.offset; | ||
fakeSlice.containerByteOffset = slice.containerByteOffset; | ||
fakeSlice.byteOffsetFromContainer = slice.byteOffsetFromContainer; | ||
fakeSlice.index = slice.index; | ||
|
||
fakeSlice.alignmentStart = SAMRecord.NO_ALIGNMENT_START; | ||
|
@@ -183,6 +193,9 @@ public void processContainer(final Container container, final ValidationStringen | |
* @throws htsjdk.samtools.SAMException if slice refers to multiple reference sequences. | ||
*/ | ||
public void processAsSingleReferenceSlice(final Slice slice) { | ||
// validate that this Slice is ready to be indexed | ||
slice.baiIndexInitializationCheck(); | ||
|
||
final ReferenceContext sliceContext = slice.getReferenceContext(); | ||
if (sliceContext.isMultiRef()) { | ||
throw new SAMException("Expecting a single reference or unmapped slice."); | ||
|
@@ -250,40 +263,31 @@ public static void createIndex(final SeekableStream stream, | |
} | ||
final CRAMBAIIndexer indexer = new CRAMBAIIndexer(output, cramHeader.getSamFileHeader()); | ||
|
||
int totalRecords = 0; | ||
Container container = null; | ||
ProgressLogger progressLogger = new ProgressLogger(log, 1, "indexed", "slices"); | ||
do { | ||
try { | ||
final long offset = stream.position(); | ||
container = ContainerIO.readContainer(cramHeader.getVersion(), stream); | ||
if (container == null || container.isEOF()) { | ||
break; | ||
} | ||
container = ContainerIO.readContainer(cramHeader.getVersion(), stream); | ||
if (container == null || container.isEOF()) { | ||
break; | ||
} | ||
|
||
container.offset = offset; | ||
|
||
indexer.processContainer(container, validationStringency); | ||
|
||
if (null != log) { | ||
String sequenceName; | ||
final ReferenceContext containerContext = container.getReferenceContext(); | ||
switch (containerContext.getType()) { | ||
case UNMAPPED_UNPLACED_TYPE: | ||
sequenceName = "?"; | ||
break; | ||
case MULTIPLE_REFERENCE_TYPE: | ||
sequenceName = "???"; | ||
break; | ||
default: | ||
sequenceName = cramHeader.getSamFileHeader().getSequence(containerContext.getSequenceId()).getSequenceName(); | ||
break; | ||
} | ||
progressLogger.record(sequenceName, container.alignmentStart); | ||
indexer.processContainer(container, validationStringency); | ||
|
||
if (null != log) { | ||
String sequenceName; | ||
final ReferenceContext containerContext = container.getReferenceContext(); | ||
switch (containerContext.getType()) { | ||
case UNMAPPED_UNPLACED_TYPE: | ||
sequenceName = "?"; | ||
break; | ||
case MULTIPLE_REFERENCE_TYPE: | ||
sequenceName = "???"; | ||
break; | ||
default: | ||
sequenceName = cramHeader.getSamFileHeader().getSequence(containerContext.getSequenceId()).getSequenceName(); | ||
break; | ||
} | ||
|
||
} catch (final IOException e) { | ||
throw new RuntimeException("Failed to read cram container", e); | ||
progressLogger.record(sequenceName, container.alignmentStart); | ||
} | ||
|
||
} while (!container.isEOF()); | ||
|
@@ -296,8 +300,11 @@ public static void createIndex(final SeekableStream stream, | |
* One instance is used to construct an entire index. | ||
* processAlignment is called for each alignment until a new reference is encountered, then | ||
* processReference is called when all records for the reference have been processed. | ||
* | ||
* TODO: this was copied from BAMIndexer.BAMIndexBuilder | ||
* so perhaps they should be merged back together | ||
*/ | ||
private class BAMIndexBuilder { | ||
private class CRAMBAIIndexBuilder { | ||
|
||
private final SAMFileHeader bamHeader; | ||
|
||
|
@@ -315,7 +322,7 @@ private class BAMIndexBuilder { | |
/** | ||
* @param header SAMFileHeader used for reference name (in index stats) and for max bin number | ||
*/ | ||
private BAMIndexBuilder(final SAMFileHeader header) { | ||
private CRAMBAIIndexBuilder(final SAMFileHeader header) { | ||
this.bamHeader = header; | ||
} | ||
|
||
|
@@ -353,7 +360,7 @@ private int computeIndexingBin(final Slice slice) { | |
* Record any index information for a given CRAM slice | ||
* | ||
* Reads these Slice fields: | ||
* sequenceId, alignmentStart, alignmentSpan, containerOffset, index | ||
* sequenceId, alignmentStart, alignmentSpan, containerByteOffset, index | ||
* | ||
* @param slice CRAM slice, single ref only. | ||
*/ | ||
|
@@ -396,8 +403,8 @@ private void processSingleReferenceSlice(final Slice slice) { | |
|
||
// process chunks | ||
|
||
final long chunkStart = (slice.containerOffset << 16) | slice.index; | ||
final long chunkEnd = ((slice.containerOffset << 16) | slice.index) + 1; | ||
final long chunkStart = (slice.containerByteOffset << 16) | slice.index; | ||
final long chunkEnd = ((slice.containerByteOffset << 16) | slice.index) + 1; | ||
|
||
final Chunk newChunk = new Chunk(chunkStart, chunkEnd); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,7 +13,6 @@ | |
import java.io.OutputStream; | ||
import java.io.IOException; | ||
import java.util.Collection; | ||
import java.util.List; | ||
import java.util.Scanner; | ||
import java.util.zip.GZIPInputStream; | ||
import java.util.zip.GZIPOutputStream; | ||
|
@@ -94,27 +93,19 @@ public void finish() { | |
* @param craiStream stream for output index | ||
*/ | ||
public static void writeIndex(final SeekableStream cramStream, OutputStream craiStream) { | ||
try { | ||
final CramHeader cramHeader = CramIO.readCramHeader(cramStream); | ||
final CRAMCRAIIndexer indexer = new CRAMCRAIIndexer(craiStream, cramHeader.getSamFileHeader()); | ||
final Version cramVersion = cramHeader.getVersion(); | ||
|
||
// get the first container and its offset | ||
long offset = cramStream.position(); | ||
Container container = ContainerIO.readContainer(cramVersion, cramStream); | ||
|
||
while (container != null && !container.isEOF()) { | ||
container.offset = offset; | ||
indexer.processContainer(container); | ||
offset = cramStream.position(); | ||
container = ContainerIO.readContainer(cramVersion, cramStream); | ||
} | ||
final CramHeader cramHeader = CramIO.readCramHeader(cramStream); | ||
final CRAMCRAIIndexer indexer = new CRAMCRAIIndexer(craiStream, cramHeader.getSamFileHeader()); | ||
final Version cramVersion = cramHeader.getVersion(); | ||
|
||
indexer.finish(); | ||
} | ||
catch (IOException e) { | ||
throw new RuntimeIOException("Error writing CRAI index to output stream"); | ||
// get the first container | ||
Container container = ContainerIO.readContainer(cramVersion, cramStream); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. offset is set inside |
||
|
||
while (container != null && !container.isEOF()) { | ||
indexer.processContainer(container); | ||
container = ContainerIO.readContainer(cramVersion, cramStream); | ||
} | ||
|
||
indexer.finish(); | ||
} | ||
|
||
/** | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,7 +21,7 @@ | |
import htsjdk.samtools.cram.ref.CRAMReferenceSource; | ||
import htsjdk.samtools.cram.ref.ReferenceSource; | ||
import htsjdk.samtools.cram.structure.Container; | ||
import htsjdk.samtools.cram.structure.ContainerIO; | ||
import htsjdk.samtools.cram.structure.ContainerHeaderIO; | ||
import htsjdk.samtools.seekablestream.SeekableFileStream; | ||
import htsjdk.samtools.seekablestream.SeekableStream; | ||
import htsjdk.samtools.util.CloseableIterator; | ||
|
@@ -400,7 +400,7 @@ public CloseableIterator<SAMRecord> queryUnmapped() { | |
seekableStream.seek(0); | ||
newIterator = new CRAMIterator(seekableStream, referenceSource, validationStringency); | ||
seekableStream.seek(startOfLastLinearBin >>> 16); | ||
final Container container = ContainerIO.readContainerHeader(newIterator.getCramHeader().getVersion().major, seekableStream); | ||
final Container container = ContainerHeaderIO.readContainerHeader(newIterator.getCramHeader().getVersion().major, seekableStream); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this was driving me nuts There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TODO obviously: move this to a new |
||
seekableStream.seek(seekableStream.position() + container.containerByteSize); | ||
iterator = newIterator; | ||
boolean atAlignments; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
renamed the Slice indexing parameters to make their function clearer.