Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update CramCompressionRecord.isPlaced() to make it APDelta-aware #1284

Merged
merged 20 commits into from
Feb 21, 2019
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -379,8 +379,8 @@ protected void flushContainer() throws IllegalArgumentException, IllegalAccessEx
while (last.next != null) last = last.next;

if (cramRecord.isFirstSegment() && last.isLastSegment()) {

final int templateLength = CramNormalizer.computeInsertSize(cramRecord, last);
final boolean coordinateSorted = (samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate);
final int templateLength = CramNormalizer.computeInsertSize(cramRecord, last, coordinateSorted);

if (cramRecord.templateSize == templateLength) {
last = cramRecord.next;
Expand Down
16 changes: 10 additions & 6 deletions src/main/java/htsjdk/samtools/cram/build/CramNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ public void normalize(final ArrayList<CramCompressionRecord> records,
for (final CramCompressionRecord record : records) {
if (record.previous != null) continue;
if (record.next == null) continue;
restoreMateInfo(record);
final boolean usePositionDeltaEncoding = header.getSortOrder() == SAMFileHeader.SortOrder.coordinate;
restoreMateInfo(record, usePositionDeltaEncoding);
}
}

Expand Down Expand Up @@ -137,7 +138,8 @@ public void normalize(final ArrayList<CramCompressionRecord> records,
restoreQualityScores(defaultQualityScore, records);
}

private static void restoreMateInfo(final CramCompressionRecord record) {
private static void restoreMateInfo(final CramCompressionRecord record,
final boolean usePositionDeltaEncoding) {
if (record.next == null) {

return;
Expand All @@ -155,7 +157,7 @@ private static void restoreMateInfo(final CramCompressionRecord record) {
// record.setFirstSegment(true);
// last.setLastSegment(true);

final int templateLength = computeInsertSize(record, last);
final int templateLength = computeInsertSize(record, last, usePositionDeltaEncoding);
record.templateSize = templateLength;
last.templateSize = -templateLength;
}
Expand Down Expand Up @@ -326,19 +328,21 @@ private static byte getByteOrDefault(final byte[] array, final int pos,
*
* @param firstEnd first mate of the pair
* @param secondEnd second mate of the pair
* @param usePositionDeltaEncoding do these records delta-encode their alignment starts?
* @return template length
*/
public static int computeInsertSize(final CramCompressionRecord firstEnd,
final CramCompressionRecord secondEnd) {
final CramCompressionRecord secondEnd,
final boolean usePositionDeltaEncoding) {
if (firstEnd.isSegmentUnmapped() || secondEnd.isSegmentUnmapped()) {
return 0;
}
if (firstEnd.sequenceId != secondEnd.sequenceId) {
return 0;
}

final int firstEnd5PrimePosition = firstEnd.isNegativeStrand() ? firstEnd.getAlignmentEnd() : firstEnd.alignmentStart;
final int secondEnd5PrimePosition = secondEnd.isNegativeStrand() ? secondEnd.getAlignmentEnd() : secondEnd.alignmentStart;
final int firstEnd5PrimePosition = firstEnd.isNegativeStrand() ? firstEnd.getAlignmentEnd(usePositionDeltaEncoding) : firstEnd.alignmentStart;
final int secondEnd5PrimePosition = secondEnd.isNegativeStrand() ? secondEnd.getAlignmentEnd(usePositionDeltaEncoding) : secondEnd.alignmentStart;

final int adjustment = (secondEnd5PrimePosition >= firstEnd5PrimePosition) ? +1 : -1;
return secondEnd5PrimePosition - firstEnd5PrimePosition + adjustment;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import htsjdk.samtools.cram.encoding.readfeatures.Insertion;
import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature;
import htsjdk.samtools.cram.encoding.readfeatures.SoftClip;
import htsjdk.samtools.util.Log;

import java.util.Arrays;
import java.util.Collection;
Expand All @@ -49,12 +50,17 @@ public class CramCompressionRecord {
private static final int HAS_MATE_DOWNSTREAM_FLAG = 0x4;
private static final int UNKNOWN_BASES = 0x8;

private static final int UNINITIALIZED_END = -1;
private static final int UNINITIALIZED_SPAN = -1;

private static final Log log = Log.getInstance(CramCompressionRecord.class);

// sequential index of the record in a stream:
public int index = 0;
public int alignmentStart;
public int alignmentDelta;
private int alignmentEnd = -1;
private int alignmentSpan = -1;
private int alignmentEnd = UNINITIALIZED_END;
private int alignmentSpan = UNINITIALIZED_SPAN;

public int readLength;

Expand Down Expand Up @@ -152,20 +158,40 @@ public String toString() {
return stringBuilder.toString();
}

public int getAlignmentSpan() {
if (alignmentSpan < 0) calculateAlignmentBoundaries();
/**
* Check whether alignmentSpan has been initialized, and do so if it has not
* @param usePositionDeltaEncoding is this record's position delta-encoded? used to call isPlaced()
* @return the initialized alignmentSpan
*/
public int getAlignmentSpan(final boolean usePositionDeltaEncoding) {
if (alignmentSpan == UNINITIALIZED_SPAN) {
intializeAlignmentBoundaries(usePositionDeltaEncoding);
}
return alignmentSpan;
}

void calculateAlignmentBoundaries() {
if (isSegmentUnmapped()) {
/**
* Check whether alignmentEnd has been initialized, and do so if it has not
* @param usePositionDeltaEncoding is this record's position delta-encoded? used to call isPlaced()
* @return the initialized alignmentEnd
*/
public int getAlignmentEnd(final boolean usePositionDeltaEncoding) {
if (alignmentEnd == UNINITIALIZED_END) {
intializeAlignmentBoundaries(usePositionDeltaEncoding);
}
return alignmentEnd;
}

private void intializeAlignmentBoundaries(final boolean usePositionDeltaEncoding) {
if (! isPlaced(usePositionDeltaEncoding)) {
alignmentSpan = 0;
alignmentEnd = SAMRecord.NO_ALIGNMENT_START;
} else if (readFeatures == null || readFeatures.isEmpty()) {
alignmentSpan = readLength;
alignmentEnd = alignmentStart + alignmentSpan - 1;
} else {
alignmentSpan = readLength;
return;
}

alignmentSpan = readLength;

if (readFeatures != null) {
for (final ReadFeature readFeature : readFeatures) {
switch (readFeature.getOperator()) {
case InsertBase.operator:
Expand All @@ -185,13 +211,10 @@ void calculateAlignmentBoundaries() {
break;
}
}
alignmentEnd = alignmentStart + alignmentSpan - 1;

}
}

public int getAlignmentEnd() {
if (alignmentEnd < 0) calculateAlignmentBoundaries();
return alignmentEnd;
alignmentEnd = alignmentStart + alignmentSpan - 1;
}

public boolean isMultiFragment() {
Expand All @@ -207,7 +230,7 @@ public void setMultiFragment(final boolean multiFragment) {
* Unmapped records may be stored in the same {@link Slice}s and {@link Container}s as mapped
* records if they are placed.
*
* @see #isPlaced()
* @see #isPlaced(boolean)
* @return true if the record is unmapped
*/
public boolean isSegmentUnmapped() {
Expand All @@ -219,16 +242,42 @@ public void setSegmentUnmapped(final boolean segmentUnmapped) {
}

/**
* Does this record have a valid placement/alignment location? This is independent of mapping status.
* It must have both a valid reference sequence ID and alignment start position to qualify.
* Unplaced reads may only be stored in Unmapped or Multiple Reference {@link Slice}s and {@link Container}s.
* Does this record have a valid placement/alignment location?
*
* It must have a valid reference sequence ID to be considered placed.
* In the case of absolute (non-delta) position encoding, it must also have a
* valid alignment start position, so we need to know if it is delta-encoded.
*
* Normally we expect to see that the unmapped flag is set for unplaced reads,
* so we log a WARNING here if the read is unplaced yet somehow mapped.
*
* @see #isSegmentUnmapped()
* @param usePositionDeltaEncoding is this read's position delta-encoded? if not, check alignment Start.
* @return true if the record is placed
*/
public boolean isPlaced() {
return sequenceId != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX &&
alignmentStart != SAMRecord.NO_ALIGNMENT_START;
public boolean isPlaced(final boolean usePositionDeltaEncoding) {
boolean placed = isPlacedInternal(!usePositionDeltaEncoding);

if (!placed && !isSegmentUnmapped()) {
final String warning = String.format(
"Cram Compression Record [%s] does not have the unmapped flag set, " +
"but also does not have a valid placement on a reference sequence.",
this.toString());
log.warn(warning);
}

return placed;
}

// check placement without regard to mapping; helper method for isPlaced()
private boolean isPlacedInternal(final boolean useAbsolutePositionEncoding) {
// placement requires a valid sequence ID
if (sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
return false;
}

// if an absolute alignment start coordinate is required but we have none, it's unplaced
return ! (useAbsolutePositionEncoding && alignmentStart == SAMRecord.NO_ALIGNMENT_START);
}

public boolean isFirstSegment() {
Expand Down
8 changes: 4 additions & 4 deletions src/main/java/htsjdk/samtools/cram/structure/Slice.java
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ public static Slice buildSlice(final List<CramCompressionRecord> records,
// Unmapped: all records are unmapped and unplaced

final CramCompressionRecord firstRecord = records.get(0);
slice.sequenceId = firstRecord.isPlaced() ?
slice.sequenceId = firstRecord.isPlaced(header.APDelta) ?
firstRecord.sequenceId :
SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;

Expand All @@ -398,19 +398,19 @@ public static Slice buildSlice(final List<CramCompressionRecord> records,
hasher.add(record);

// flip an unmapped slice to multi-ref if the record is placed
if (slice.isUnmapped() && record.isPlaced()) {
if (slice.isUnmapped() && record.isPlaced(header.APDelta)) {
slice.sequenceId = MULTI_REFERENCE;
}
else if (slice.isMappedSingleRef()) {
// flip a single-ref slice to multi-ref if the record is unplaced or on a different ref
if (!record.isPlaced() || slice.sequenceId != record.sequenceId) {
if (!record.isPlaced(header.APDelta) || slice.sequenceId != record.sequenceId) {
slice.sequenceId = MULTI_REFERENCE;
}
else {
// calculate single ref slice alignment

minAlStart = Math.min(record.alignmentStart, minAlStart);
maxAlEnd = Math.max(record.getAlignmentEnd(), maxAlEnd);
maxAlEnd = Math.max(record.getAlignmentEnd(header.APDelta), maxAlEnd);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.cram.encoding.readfeatures.*;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.ArrayList;
Expand All @@ -12,19 +13,24 @@
* Created by vadim on 28/09/2015.
*/
public class CramCompressionRecordTest extends HtsjdkTest {
@Test
public void test_getAlignmentEnd() {
@DataProvider(name = "deltaEncodingTrueFalse")
private Object[][] tf() {
return new Object[][] { {true}, {false}};
}

@Test(dataProvider = "deltaEncodingTrueFalse")
jmthibault79 marked this conversation as resolved.
Show resolved Hide resolved
public void test_getAlignmentEnd(final boolean usePositionDeltaEncoding) {
CramCompressionRecord r = new CramCompressionRecord();
r.alignmentStart = 1;
r.setSegmentUnmapped(true);
Assert.assertEquals(r.getAlignmentEnd(), SAMRecord.NO_ALIGNMENT_START);
Assert.assertEquals(r.getAlignmentEnd(usePositionDeltaEncoding), SAMRecord.NO_ALIGNMENT_START);

r = new CramCompressionRecord();
int readLength = 100;
r.alignmentStart = 1;
r.readLength = readLength;
r.setSegmentUnmapped(false);
Assert.assertEquals(r.getAlignmentEnd(), r.readLength + r.alignmentStart - 1);
Assert.assertEquals(r.getAlignmentEnd(usePositionDeltaEncoding), r.readLength + r.alignmentStart - 1);

r = new CramCompressionRecord();
r.alignmentStart = 1;
Expand All @@ -33,7 +39,7 @@ public void test_getAlignmentEnd() {
r.readFeatures = new ArrayList<>();
String softClip = "AAA";
r.readFeatures.add(new SoftClip(1, softClip.getBytes()));
Assert.assertEquals(r.getAlignmentEnd(), r.readLength + r.alignmentStart - 1 - softClip.length());
Assert.assertEquals(r.getAlignmentEnd(usePositionDeltaEncoding), r.readLength + r.alignmentStart - 1 - softClip.length());

r = new CramCompressionRecord();
r.alignmentStart = 1;
Expand All @@ -42,7 +48,7 @@ public void test_getAlignmentEnd() {
r.readFeatures = new ArrayList<>();
int deletionLength = 5;
r.readFeatures.add(new Deletion(1, deletionLength));
Assert.assertEquals(r.getAlignmentEnd(), r.readLength + r.alignmentStart - 1 + deletionLength);
Assert.assertEquals(r.getAlignmentEnd(usePositionDeltaEncoding), r.readLength + r.alignmentStart - 1 + deletionLength);

r = new CramCompressionRecord();
r.alignmentStart = 1;
Expand All @@ -51,38 +57,48 @@ public void test_getAlignmentEnd() {
r.readFeatures = new ArrayList<>();
String insertion = "CCCCCCCCCC";
r.readFeatures.add(new Insertion(1, insertion.getBytes()));
Assert.assertEquals(r.getAlignmentEnd(), r.readLength + r.alignmentStart - 1 - insertion.length());

Assert.assertEquals(r.getAlignmentEnd(usePositionDeltaEncoding), r.readLength + r.alignmentStart - 1 - insertion.length());

r = new CramCompressionRecord();
r.alignmentStart = 1;
r.readLength = readLength;
r.setSegmentUnmapped(false);
r.readFeatures = new ArrayList<>();
r.readFeatures.add(new InsertBase(1, (byte) 'A'));
Assert.assertEquals(r.getAlignmentEnd(), r.readLength + r.alignmentStart - 1 - 1);
Assert.assertEquals(r.getAlignmentEnd(usePositionDeltaEncoding), r.readLength + r.alignmentStart - 1 - 1);
}

@Test
public void test_isPlaced() {
final CramCompressionRecord r = new CramCompressionRecord();
@DataProvider(name = "placedTests")
private Object[][] placedTests() {
return new Object[][] {
// usePositionDeltaEncoding = false. Must have a valid Start as well as a Reference
{false, false, false, false},
// usePositionDeltaEncoding = true. Must have a valid Reference. Invalid Start is OK because we use the Delta instead
{true, false, false, true}
jmthibault79 marked this conversation as resolved.
Show resolved Hide resolved
};
}

// it's only Placed if both of these are valid
@Test(dataProvider = "placedTests")
public void test_isPlaced(final boolean usePositionDeltaEncoding,
final boolean noRefExpectation,
final boolean bothExpectation,
final boolean noStartExpectation) {
final CramCompressionRecord r = new CramCompressionRecord();

r.sequenceId = 5;
r.alignmentStart = 10;
Assert.assertTrue(r.isPlaced());
Assert.assertTrue(r.isPlaced(usePositionDeltaEncoding));

r.sequenceId = SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
Assert.assertFalse(r.isPlaced());
Assert.assertEquals(r.isPlaced(usePositionDeltaEncoding), noRefExpectation);

r.alignmentStart = SAMRecord.NO_ALIGNMENT_START;
Assert.assertFalse(r.isPlaced());
Assert.assertEquals(r.isPlaced(usePositionDeltaEncoding), bothExpectation);

r.sequenceId = 3;
Assert.assertFalse(r.isPlaced());
Assert.assertEquals(r.isPlaced(usePositionDeltaEncoding), noStartExpectation);

r.alignmentStart = 15;
Assert.assertTrue(r.isPlaced());
Assert.assertTrue(r.isPlaced(usePositionDeltaEncoding));
}
}
Loading