Skip to content

Commit

Permalink
DataSeries/DataReader/DataWriter refactor (#1219)
Browse files Browse the repository at this point in the history
* Code simplification, new tests, and removal of unneeded reflection.
* This makes many breaking changes to the existing cram code.
* Closes #453
  • Loading branch information
jmthibault79 authored and lbergelson committed Nov 7, 2018
1 parent 5208410 commit 4408649
Show file tree
Hide file tree
Showing 36 changed files with 1,405 additions and 1,453 deletions.
42 changes: 2 additions & 40 deletions src/main/java/htsjdk/samtools/cram/CRAIIndex.java
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
package htsjdk.samtools.cram;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.CRAMBAIIndexer;
import htsjdk.samtools.CRAMCRAIIndexer;
import htsjdk.samtools.cram.encoding.reader.DataReaderFactory;
import htsjdk.samtools.cram.encoding.reader.RefSeqIdReader;
import htsjdk.samtools.cram.io.DefaultBitInputStream;
import htsjdk.samtools.cram.encoding.reader.MultiRefSliceAlignmentSpanReader;
import htsjdk.samtools.cram.structure.*;
import htsjdk.samtools.seekablestream.SeekableMemoryStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.ValidationStringency;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
Expand All @@ -23,7 +19,6 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import java.util.List;
Expand Down Expand Up @@ -97,41 +92,8 @@ private static Collection<CRAIEntry> getCRAIEntriesForMultiRefSlice(
final long containerOffset,
final int[] landmarks)
{
final DataReaderFactory dataReaderFactory = new DataReaderFactory();
final Map<Integer, InputStream> inputMap = new HashMap<>();
for (final Integer exId : slice.external.keySet()) {
inputMap.put(exId, new ByteArrayInputStream(slice.external.get(exId).getRawContent()));
}

final RefSeqIdReader reader = new RefSeqIdReader(
slice.sequenceId,
slice.alignmentStart,
ValidationStringency.DEFAULT_STRINGENCY);
dataReaderFactory.buildReader(
reader,
new DefaultBitInputStream(new ByteArrayInputStream(slice.coreBlock.getRawContent())),
inputMap,
header,
slice.sequenceId
);
reader.APDelta = header.APDelta;

for (int i = 0; i < slice.nofRecords; i++) {
final CramCompressionRecord record = new CramCompressionRecord();
record.sliceIndex = slice.index;
record.index = i;

reader.read();

if (record.sequenceId == slice.sequenceId) {
record.sequenceId = slice.sequenceId;
}
else if (record.sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
record.sequenceName = SAMRecord.NO_ALIGNMENT_REFERENCE_NAME;
}
}
final Map<Integer, AlignmentSpan> spans = slice.getMultiRefAlignmentSpans(header, ValidationStringency.DEFAULT_STRINGENCY);

Map<Integer, AlignmentSpan> spans = reader.getReferenceSpans();
List<CRAIEntry> entries = new ArrayList<>(spans.size());
for (int seqId : spans.keySet()) {
CRAIEntry e = new CRAIEntry();
Expand Down
113 changes: 60 additions & 53 deletions src/main/java/htsjdk/samtools/cram/build/CompressionHeaderFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import htsjdk.samtools.cram.encoding.readfeatures.Substitution;
import htsjdk.samtools.cram.structure.CompressionHeader;
import htsjdk.samtools.cram.structure.CramCompressionRecord;
import htsjdk.samtools.cram.structure.EncodingKey;
import htsjdk.samtools.cram.structure.DataSeries;
import htsjdk.samtools.cram.structure.EncodingParams;
import htsjdk.samtools.cram.structure.ReadTag;
import htsjdk.samtools.cram.structure.SubstitutionMatrix;
Expand Down Expand Up @@ -70,47 +70,47 @@ public CompressionHeaderFactory() {
* @param substitutionMatrix
* a matrix of base substitution frequencies, can be null, in
* which case it is re-calculated.
* @param sorted
* @param coordinateSorted
* if true the records are assumed to be sorted by alignment
* position
* @return {@link htsjdk.samtools.cram.structure.CompressionHeader} object
* describing the encoding chosen for the data
*/
public CompressionHeader build(final List<CramCompressionRecord> records, SubstitutionMatrix substitutionMatrix,
final boolean sorted) {
final boolean coordinateSorted) {

final CompressionHeaderBuilder builder = new CompressionHeaderBuilder(sorted);
final CompressionHeaderBuilder builder = new CompressionHeaderBuilder(coordinateSorted);

builder.addExternalIntegerRansOrderZeroEncoding(EncodingKey.AP_AlignmentPositionOffset);
builder.addExternalByteRansOrderOneEncoding(EncodingKey.BA_Base);
builder.addExternalIntegerRansOrderZeroEncoding(DataSeries.AP_AlignmentPositionOffset);
builder.addExternalByteRansOrderOneEncoding(DataSeries.BA_Base);
// BB is not used
builder.addExternalIntegerRansOrderOneEncoding(EncodingKey.BF_BitFlags);
builder.addExternalByteGzipEncoding(EncodingKey.BS_BaseSubstitutionCode);
builder.addExternalIntegerRansOrderOneEncoding(EncodingKey.CF_CompressionBitFlags);
builder.addExternalIntegerGzipEncoding(EncodingKey.DL_DeletionLength);
builder.addExternalByteGzipEncoding(EncodingKey.FC_FeatureCode);
builder.addExternalIntegerGzipEncoding(EncodingKey.FN_NumberOfReadFeatures);
builder.addExternalIntegerGzipEncoding(EncodingKey.FP_FeaturePosition);
builder.addExternalIntegerGzipEncoding(EncodingKey.HC_HardClip);
builder.addExternalByteArrayStopTabGzipEncoding(EncodingKey.IN_Insertion);
builder.addExternalIntegerGzipEncoding(EncodingKey.MF_MateBitFlags);
builder.addExternalIntegerGzipEncoding(EncodingKey.MQ_MappingQualityScore);
builder.addExternalIntegerGzipEncoding(EncodingKey.NF_RecordsToNextFragment);
builder.addExternalIntegerGzipEncoding(EncodingKey.NP_NextFragmentAlignmentStart);
builder.addExternalIntegerRansOrderOneEncoding(EncodingKey.NS_NextFragmentReferenceSequenceID);
builder.addExternalIntegerGzipEncoding(EncodingKey.PD_padding);
builder.addExternalIntegerRansOrderOneEncoding(DataSeries.BF_BitFlags);
builder.addExternalByteGzipEncoding(DataSeries.BS_BaseSubstitutionCode);
builder.addExternalIntegerRansOrderOneEncoding(DataSeries.CF_CompressionBitFlags);
builder.addExternalIntegerGzipEncoding(DataSeries.DL_DeletionLength);
builder.addExternalByteGzipEncoding(DataSeries.FC_FeatureCode);
builder.addExternalIntegerGzipEncoding(DataSeries.FN_NumberOfReadFeatures);
builder.addExternalIntegerGzipEncoding(DataSeries.FP_FeaturePosition);
builder.addExternalIntegerGzipEncoding(DataSeries.HC_HardClip);
builder.addExternalByteArrayStopTabGzipEncoding(DataSeries.IN_Insertion);
builder.addExternalIntegerGzipEncoding(DataSeries.MF_MateBitFlags);
builder.addExternalIntegerGzipEncoding(DataSeries.MQ_MappingQualityScore);
builder.addExternalIntegerGzipEncoding(DataSeries.NF_RecordsToNextFragment);
builder.addExternalIntegerGzipEncoding(DataSeries.NP_NextFragmentAlignmentStart);
builder.addExternalIntegerRansOrderOneEncoding(DataSeries.NS_NextFragmentReferenceSequenceID);
builder.addExternalIntegerGzipEncoding(DataSeries.PD_padding);
// QQ is not used
builder.addExternalByteRansOrderOneEncoding(EncodingKey.QS_QualityScore);
builder.addExternalIntegerRansOrderOneEncoding(EncodingKey.RG_ReadGroup);
builder.addExternalIntegerRansOrderZeroEncoding(EncodingKey.RI_RefId);
builder.addExternalIntegerRansOrderOneEncoding(EncodingKey.RL_ReadLength);
builder.addExternalByteArrayStopTabGzipEncoding(EncodingKey.RN_ReadName);
builder.addExternalIntegerGzipEncoding(EncodingKey.RS_RefSkip);
builder.addExternalByteArrayStopTabGzipEncoding(EncodingKey.SC_SoftClip);
builder.addExternalIntegerGzipEncoding(EncodingKey.TC_TagCount);
builder.addExternalIntegerEncoding(EncodingKey.TL_TagIdList, ExternalCompressor.createGZIP());
builder.addExternalIntegerGzipEncoding(EncodingKey.TN_TagNameAndType);
builder.addExternalIntegerRansOrderOneEncoding(EncodingKey.TS_InsetSize);
builder.addExternalByteRansOrderOneEncoding(DataSeries.QS_QualityScore);
builder.addExternalIntegerRansOrderOneEncoding(DataSeries.RG_ReadGroup);
builder.addExternalIntegerRansOrderZeroEncoding(DataSeries.RI_RefId);
builder.addExternalIntegerRansOrderOneEncoding(DataSeries.RL_ReadLength);
builder.addExternalByteArrayStopTabGzipEncoding(DataSeries.RN_ReadName);
builder.addExternalIntegerGzipEncoding(DataSeries.RS_RefSkip);
builder.addExternalByteArrayStopTabGzipEncoding(DataSeries.SC_SoftClip);
builder.addExternalIntegerGzipEncoding(DataSeries.TC_TagCount);
builder.addExternalIntegerEncoding(DataSeries.TL_TagIdList, ExternalCompressor.createGZIP());
builder.addExternalIntegerGzipEncoding(DataSeries.TN_TagNameAndType);
builder.addExternalIntegerRansOrderOneEncoding(DataSeries.TS_InsetSize);

builder.setTagIdDictionary(buildTagIdDictionary(records));

Expand Down Expand Up @@ -479,56 +479,63 @@ private EncodingDetails buildEncodingForTag(final List<CramCompressionRecord> re
private static class CompressionHeaderBuilder {
private final CompressionHeader header;

CompressionHeaderBuilder(final boolean sorted) {
CompressionHeaderBuilder(final boolean coordinateSorted) {
header = new CompressionHeader();
header.externalIds = new ArrayList<>();
header.tMap = new TreeMap<>();

header.encodingMap = new TreeMap<>();
header.APDelta = sorted;
header.APDelta = coordinateSorted;
}

CompressionHeader getHeader() {
return header;
}

void addExternalEncoding(final EncodingKey encodingKey, final EncodingParams params,
final ExternalCompressor compressor) {
header.externalIds.add(encodingKey.ordinal());
header.externalCompressors.put(encodingKey.ordinal(), compressor);
header.encodingMap.put(encodingKey, params);
private void addExternalEncoding(final DataSeries dataSeries,
final EncodingParams params,
final ExternalCompressor compressor) {
header.externalIds.add(dataSeries.getExternalBlockContentId());
header.externalCompressors.put(dataSeries.getExternalBlockContentId(), compressor);
header.encodingMap.put(dataSeries, params);
}

void addExternalByteArrayStopTabGzipEncoding(final EncodingKey encodingKey) {
addExternalEncoding(encodingKey, ByteArrayStopEncoding.toParam((byte) '\t', encodingKey.ordinal()),
private void addExternalByteArrayStopTabGzipEncoding(final DataSeries dataSeries) {
addExternalEncoding(dataSeries,
ByteArrayStopEncoding.toParam((byte) '\t', dataSeries.getExternalBlockContentId()),
ExternalCompressor.createGZIP());
}

void addExternalIntegerEncoding(final EncodingKey encodingKey, final ExternalCompressor compressor) {
addExternalEncoding(encodingKey, ExternalIntegerEncoding.toParam(encodingKey.ordinal()), compressor);
private void addExternalIntegerEncoding(final DataSeries dataSeries, final ExternalCompressor compressor) {
addExternalEncoding(dataSeries,
ExternalIntegerEncoding.toParam(dataSeries.getExternalBlockContentId()),
compressor);
}

void addExternalIntegerGzipEncoding(final EncodingKey encodingKey) {
addExternalEncoding(encodingKey, ExternalIntegerEncoding.toParam(encodingKey.ordinal()),
private void addExternalIntegerGzipEncoding(final DataSeries dataSeries) {
addExternalEncoding(dataSeries,
ExternalIntegerEncoding.toParam(dataSeries.getExternalBlockContentId()),
ExternalCompressor.createGZIP());
}

void addExternalByteGzipEncoding(final EncodingKey encodingKey) {
addExternalEncoding(encodingKey, ExternalByteEncoding.toParam(encodingKey.ordinal()),
private void addExternalByteGzipEncoding(final DataSeries dataSeries) {
addExternalEncoding(dataSeries,
ExternalByteEncoding.toParam(dataSeries.getExternalBlockContentId()),
ExternalCompressor.createGZIP());
}

void addExternalByteRansOrderOneEncoding(final EncodingKey encodingKey) {
addExternalEncoding(encodingKey, ExternalByteEncoding.toParam(encodingKey.ordinal()),
private void addExternalByteRansOrderOneEncoding(final DataSeries dataSeries) {
addExternalEncoding(dataSeries,
ExternalByteEncoding.toParam(dataSeries.getExternalBlockContentId()),
ExternalCompressor.createRANS(RANS.ORDER.ONE));
}

void addExternalIntegerRansOrderOneEncoding(final EncodingKey encodingKey) {
addExternalIntegerEncoding(encodingKey, ExternalCompressor.createRANS(RANS.ORDER.ONE));
private void addExternalIntegerRansOrderOneEncoding(final DataSeries dataSeries) {
addExternalIntegerEncoding(dataSeries, ExternalCompressor.createRANS(RANS.ORDER.ONE));
}

void addExternalIntegerRansOrderZeroEncoding(final EncodingKey encodingKey) {
addExternalIntegerEncoding(encodingKey, ExternalCompressor.createRANS(RANS.ORDER.ZERO));
private void addExternalIntegerRansOrderZeroEncoding(final DataSeries dataSeries) {
addExternalIntegerEncoding(dataSeries, ExternalCompressor.createRANS(RANS.ORDER.ZERO));
}

void addTagEncoding(final int tagId, final EncodingDetails encodingDetails) {
Expand Down
29 changes: 7 additions & 22 deletions src/main/java/htsjdk/samtools/cram/build/ContainerFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.cram.digest.ContentDigests;
import htsjdk.samtools.cram.encoding.ExternalCompressor;
import htsjdk.samtools.cram.encoding.writer.DataWriterFactory;
import htsjdk.samtools.cram.encoding.writer.Writer;
import htsjdk.samtools.cram.encoding.writer.CramRecordWriter;
import htsjdk.samtools.cram.io.DefaultBitOutputStream;
import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream;
import htsjdk.samtools.cram.structure.Block;
Expand Down Expand Up @@ -60,12 +59,10 @@ Container buildContainer(final List<CramCompressionRecord> records,
final SubstitutionMatrix substitutionMatrix)
throws IllegalArgumentException, IllegalAccessException,
IOException {
// get stats, create compression header and slices
final long time1 = System.nanoTime();
final CompressionHeader header = new CompressionHeaderFactory().build(records,
substitutionMatrix, samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate);
header.APDelta = samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate;
final long time2 = System.nanoTime();

// sets header APDelta
final boolean coordinateSorted = samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate;
final CompressionHeader header = new CompressionHeaderFactory().build(records, substitutionMatrix, coordinateSorted);

header.readNamesIncluded = preserveReadNames;

Expand All @@ -78,7 +75,6 @@ Container buildContainer(final List<CramCompressionRecord> records,
container.bases = 0;
container.blockCount = 0;

final long time3 = System.nanoTime();
long lastGlobalRecordCounter = container.globalRecordCounter;
for (int i = 0; i < records.size(); i += recordsPerSlice) {
final List<CramCompressionRecord> sliceRecords = records.subList(i,
Expand All @@ -94,14 +90,9 @@ Container buildContainer(final List<CramCompressionRecord> records,
container.sequenceId = slice.sequenceId;
}

final long time4 = System.nanoTime();

container.slices = slices.toArray(new Slice[slices.size()]);
calculateAlignmentBoundaries(container);

container.buildHeaderTime = time2 - time1;
container.buildSlicesTime = time4 - time3;

globalRecordCounter += records.size();
return container;
}
Expand Down Expand Up @@ -131,7 +122,6 @@ private static Slice buildSlice(final List<CramCompressionRecord> records,
map.put(id, new ExposedByteArrayOutputStream());
}

final DataWriterFactory dataWriterFactory = new DataWriterFactory();
final ExposedByteArrayOutputStream bitBAOS = new ExposedByteArrayOutputStream();
final DefaultBitOutputStream bitOutputStream = new DefaultBitOutputStream(bitBAOS);

Expand Down Expand Up @@ -175,13 +165,8 @@ private static Slice buildSlice(final List<CramCompressionRecord> records,
slice.alignmentSpan = maxAlEnd - minAlStart + 1;
}

final Writer writer = dataWriterFactory.buildWriter(bitOutputStream, map, header, slice.sequenceId);
int prevAlStart = slice.alignmentStart;
for (final CramCompressionRecord record : records) {
record.alignmentDelta = record.alignmentStart - prevAlStart;
prevAlStart = record.alignmentStart;
writer.write(record);
}
final CramRecordWriter writer = new CramRecordWriter(bitOutputStream, map, header, slice.sequenceId);
writer.writeCramCompressionRecords(records, slice.alignmentStart);

bitOutputStream.close();
slice.coreBlock = Block.buildNewCore(bitBAOS.toByteArray());
Expand Down
Loading

0 comments on commit 4408649

Please sign in to comment.