diff --git a/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java b/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java index c588bdb46d..a4a9b7832a 100644 --- a/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java +++ b/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java @@ -379,8 +379,8 @@ protected void flushContainer() throws IllegalArgumentException, IllegalAccessEx while (last.next != null) last = last.next; if (cramRecord.isFirstSegment() && last.isLastSegment()) { - - final int templateLength = CramNormalizer.computeInsertSize(cramRecord, last); + final boolean coordinateSorted = (samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate); + final int templateLength = CramNormalizer.computeInsertSize(cramRecord, last, coordinateSorted); if (cramRecord.templateSize == templateLength) { last = cramRecord.next; diff --git a/src/main/java/htsjdk/samtools/cram/build/CramNormalizer.java b/src/main/java/htsjdk/samtools/cram/build/CramNormalizer.java index b2dd67c448..842b31129f 100644 --- a/src/main/java/htsjdk/samtools/cram/build/CramNormalizer.java +++ b/src/main/java/htsjdk/samtools/cram/build/CramNormalizer.java @@ -94,7 +94,8 @@ public void normalize(final ArrayList records, for (final CramCompressionRecord record : records) { if (record.previous != null) continue; if (record.next == null) continue; - restoreMateInfo(record); + final boolean usePositionDeltaEncoding = header.getSortOrder() == SAMFileHeader.SortOrder.coordinate; + restoreMateInfo(record, usePositionDeltaEncoding); } } @@ -137,7 +138,8 @@ public void normalize(final ArrayList records, restoreQualityScores(defaultQualityScore, records); } - private static void restoreMateInfo(final CramCompressionRecord record) { + private static void restoreMateInfo(final CramCompressionRecord record, + final boolean usePositionDeltaEncoding) { if (record.next == null) { return; @@ -155,7 +157,7 @@ private static void restoreMateInfo(final CramCompressionRecord record) { // record.setFirstSegment(true); // last.setLastSegment(true); - final int templateLength = computeInsertSize(record, last); + final int templateLength = computeInsertSize(record, last, usePositionDeltaEncoding); record.templateSize = templateLength; last.templateSize = -templateLength; } @@ -326,10 +328,12 @@ private static byte getByteOrDefault(final byte[] array, final int pos, * * @param firstEnd first mate of the pair * @param secondEnd second mate of the pair + * @param usePositionDeltaEncoding do these records delta-encode their alignment starts? * @return template length */ public static int computeInsertSize(final CramCompressionRecord firstEnd, - final CramCompressionRecord secondEnd) { + final CramCompressionRecord secondEnd, + final boolean usePositionDeltaEncoding) { if (firstEnd.isSegmentUnmapped() || secondEnd.isSegmentUnmapped()) { return 0; } @@ -337,8 +341,8 @@ public static int computeInsertSize(final CramCompressionRecord firstEnd, return 0; } - final int firstEnd5PrimePosition = firstEnd.isNegativeStrand() ? firstEnd.getAlignmentEnd() : firstEnd.alignmentStart; - final int secondEnd5PrimePosition = secondEnd.isNegativeStrand() ? secondEnd.getAlignmentEnd() : secondEnd.alignmentStart; + final int firstEnd5PrimePosition = firstEnd.isNegativeStrand() ? firstEnd.getAlignmentEnd(usePositionDeltaEncoding) : firstEnd.alignmentStart; + final int secondEnd5PrimePosition = secondEnd.isNegativeStrand() ? secondEnd.getAlignmentEnd(usePositionDeltaEncoding) : secondEnd.alignmentStart; final int adjustment = (secondEnd5PrimePosition >= firstEnd5PrimePosition) ? +1 : -1; return secondEnd5PrimePosition - firstEnd5PrimePosition + adjustment; diff --git a/src/main/java/htsjdk/samtools/cram/structure/CramCompressionRecord.java b/src/main/java/htsjdk/samtools/cram/structure/CramCompressionRecord.java index c19545447e..3e64bc7c07 100644 --- a/src/main/java/htsjdk/samtools/cram/structure/CramCompressionRecord.java +++ b/src/main/java/htsjdk/samtools/cram/structure/CramCompressionRecord.java @@ -24,6 +24,7 @@ import htsjdk.samtools.cram.encoding.readfeatures.Insertion; import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature; import htsjdk.samtools.cram.encoding.readfeatures.SoftClip; +import htsjdk.samtools.util.Log; import java.util.Arrays; import java.util.Collection; @@ -49,12 +50,17 @@ public class CramCompressionRecord { private static final int HAS_MATE_DOWNSTREAM_FLAG = 0x4; private static final int UNKNOWN_BASES = 0x8; + private static final int UNINITIALIZED_END = -1; + private static final int UNINITIALIZED_SPAN = -1; + + private static final Log log = Log.getInstance(CramCompressionRecord.class); + // sequential index of the record in a stream: public int index = 0; public int alignmentStart; public int alignmentDelta; - private int alignmentEnd = -1; - private int alignmentSpan = -1; + private int alignmentEnd = UNINITIALIZED_END; + private int alignmentSpan = UNINITIALIZED_SPAN; public int readLength; @@ -152,20 +158,43 @@ public String toString() { return stringBuilder.toString(); } - public int getAlignmentSpan() { - if (alignmentSpan < 0) calculateAlignmentBoundaries(); + /** + * Check whether alignmentSpan has been initialized, and do so if it has not + * @param usePositionDeltaEncoding is this record's position delta-encoded? used to call isPlaced() + * @return the initialized alignmentSpan + */ + public int getAlignmentSpan(final boolean usePositionDeltaEncoding) { + if (alignmentSpan == UNINITIALIZED_SPAN) { + intializeAlignmentBoundaries(usePositionDeltaEncoding); + } return alignmentSpan; } - void calculateAlignmentBoundaries() { - if (isSegmentUnmapped()) { - alignmentSpan = 0; - alignmentEnd = SAMRecord.NO_ALIGNMENT_START; - } else if (readFeatures == null || readFeatures.isEmpty()) { - alignmentSpan = readLength; - alignmentEnd = alignmentStart + alignmentSpan - 1; - } else { - alignmentSpan = readLength; + /** + * Check whether alignmentEnd has been initialized, and do so if it has not + * @param usePositionDeltaEncoding is this record's position delta-encoded? used to call isPlaced() + * @return the initialized alignmentEnd + */ + public int getAlignmentEnd(final boolean usePositionDeltaEncoding) { + if (alignmentEnd == UNINITIALIZED_END) { + intializeAlignmentBoundaries(usePositionDeltaEncoding); + } + return alignmentEnd; + } + + // https://github.com/samtools/htsjdk/issues/1301 + // does not update alignmentSpan/alignmentEnd when the record changes + + private void intializeAlignmentBoundaries(final boolean usePositionDeltaEncoding) { + if (!isPlaced(usePositionDeltaEncoding)) { + alignmentSpan = Slice.NO_ALIGNMENT_SPAN; + alignmentEnd = Slice.NO_ALIGNMENT_END; + return; + } + + alignmentSpan = readLength; + + if (readFeatures != null) { for (final ReadFeature readFeature : readFeatures) { switch (readFeature.getOperator()) { case InsertBase.operator: @@ -185,13 +214,9 @@ void calculateAlignmentBoundaries() { break; } } - alignmentEnd = alignmentStart + alignmentSpan - 1; } - } - public int getAlignmentEnd() { - if (alignmentEnd < 0) calculateAlignmentBoundaries(); - return alignmentEnd; + alignmentEnd = alignmentStart + alignmentSpan - 1; } public boolean isMultiFragment() { @@ -207,7 +232,7 @@ public void setMultiFragment(final boolean multiFragment) { * Unmapped records may be stored in the same {@link Slice}s and {@link Container}s as mapped * records if they are placed. * - * @see #isPlaced() + * @see #isPlaced(boolean) * @return true if the record is unmapped */ public boolean isSegmentUnmapped() { @@ -219,16 +244,42 @@ public void setSegmentUnmapped(final boolean segmentUnmapped) { } /** - * Does this record have a valid placement/alignment location? This is independent of mapping status. - * It must have both a valid reference sequence ID and alignment start position to qualify. - * Unplaced reads may only be stored in Unmapped or Multiple Reference {@link Slice}s and {@link Container}s. + * Does this record have a valid placement/alignment location? + * + * It must have a valid reference sequence ID to be considered placed. + * In the case of absolute (non-delta) position encoding, it must also have a + * valid alignment start position, so we need to know if it is delta-encoded. + * + * Normally we expect to see that the unmapped flag is set for unplaced reads, + * so we log a WARNING here if the read is unplaced yet somehow mapped. * * @see #isSegmentUnmapped() + * @param usePositionDeltaEncoding is this read's position delta-encoded? if not, check alignment Start. * @return true if the record is placed */ - public boolean isPlaced() { - return sequenceId != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && - alignmentStart != SAMRecord.NO_ALIGNMENT_START; + public boolean isPlaced(final boolean usePositionDeltaEncoding) { + boolean placed = isPlacedInternal(!usePositionDeltaEncoding); + + if (!placed && !isSegmentUnmapped()) { + final String warning = String.format( + "Cram Compression Record [%s] does not have the unmapped flag set, " + + "but also does not have a valid placement on a reference sequence.", + this.toString()); + log.warn(warning); + } + + return placed; + } + + // check placement without regard to mapping; helper method for isPlaced() + private boolean isPlacedInternal(final boolean useAbsolutePositionEncoding) { + // placement requires a valid sequence ID + if (sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return false; + } + + // if an absolute alignment start coordinate is required but we have none, it's unplaced + return ! (useAbsolutePositionEncoding && alignmentStart == SAMRecord.NO_ALIGNMENT_START); } public boolean isFirstSegment() { diff --git a/src/main/java/htsjdk/samtools/cram/structure/Slice.java b/src/main/java/htsjdk/samtools/cram/structure/Slice.java index d442845249..63f5df96c6 100644 --- a/src/main/java/htsjdk/samtools/cram/structure/Slice.java +++ b/src/main/java/htsjdk/samtools/cram/structure/Slice.java @@ -52,6 +52,7 @@ public class Slice { public static final int MULTI_REFERENCE = -2; public static final int NO_ALIGNMENT_START = -1; public static final int NO_ALIGNMENT_SPAN = 0; + public static final int NO_ALIGNMENT_END = SAMRecord.NO_ALIGNMENT_START; // 0 private static final Log log = Log.getInstance(Slice.class); // as defined in the specs: @@ -388,7 +389,7 @@ public static Slice buildSlice(final List records, // Unmapped: all records are unmapped and unplaced final CramCompressionRecord firstRecord = records.get(0); - slice.sequenceId = firstRecord.isPlaced() ? + slice.sequenceId = firstRecord.isPlaced(header.APDelta) ? firstRecord.sequenceId : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; @@ -398,19 +399,19 @@ public static Slice buildSlice(final List records, hasher.add(record); // flip an unmapped slice to multi-ref if the record is placed - if (slice.isUnmapped() && record.isPlaced()) { + if (slice.isUnmapped() && record.isPlaced(header.APDelta)) { slice.sequenceId = MULTI_REFERENCE; } else if (slice.isMappedSingleRef()) { // flip a single-ref slice to multi-ref if the record is unplaced or on a different ref - if (!record.isPlaced() || slice.sequenceId != record.sequenceId) { + if (!record.isPlaced(header.APDelta) || slice.sequenceId != record.sequenceId) { slice.sequenceId = MULTI_REFERENCE; } else { // calculate single ref slice alignment minAlStart = Math.min(record.alignmentStart, minAlStart); - maxAlEnd = Math.max(record.getAlignmentEnd(), maxAlEnd); + maxAlEnd = Math.max(record.getAlignmentEnd(header.APDelta), maxAlEnd); } } } diff --git a/src/test/java/htsjdk/samtools/cram/build/ContainerFactoryTest.java b/src/test/java/htsjdk/samtools/cram/build/ContainerFactoryTest.java index 31b5c0a049..d356195c20 100644 --- a/src/test/java/htsjdk/samtools/cram/build/ContainerFactoryTest.java +++ b/src/test/java/htsjdk/samtools/cram/build/ContainerFactoryTest.java @@ -57,6 +57,21 @@ static List getSingleRefRecords(final int recordCount) { return records; } + static List getUnmappedRecords() { + final List records = new ArrayList<>(); + for (int i = 0; i < 10; i++) { + final CramCompressionRecord record = createMappedRecord(i); + record.setSegmentUnmapped(true); + record.alignmentStart = SAMRecord.NO_ALIGNMENT_START; + record.sequenceId = SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; + records.add(record); + } + return records; + } + + // these two sets of records are "half" unplaced: they have either a valid reference index or start position, + // but not both. We treat these weird edge cases as unplaced. + static List getUnmappedNoRefRecords() { final List records = new ArrayList<>(); for (int i = 0; i < 10; i++) { @@ -150,6 +165,16 @@ public void testMapped() { assertContainerState(container, recordCount, sequenceId, alignmentStart, alignmentSpan); } + @Test + public void testUnmapped() { + final ContainerFactory factory = new ContainerFactory(getSAMFileHeaderForTests(), 10); + final Container container = factory.buildContainer(getUnmappedRecords()); + assertContainerState(container, 10, SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, Slice.NO_ALIGNMENT_START, Slice.NO_ALIGNMENT_SPAN); + } + + // these two sets of records are "half" unplaced: they have either a valid reference index or start position, + // but not both. We treat these weird edge cases as unplaced. + @Test public void testUnmappedNoReferenceId() { final ContainerFactory factory = new ContainerFactory(getSAMFileHeaderForTests(), 10); diff --git a/src/test/java/htsjdk/samtools/cram/build/ContainerParserTest.java b/src/test/java/htsjdk/samtools/cram/build/ContainerParserTest.java index 187e1c40ab..7ef129121f 100644 --- a/src/test/java/htsjdk/samtools/cram/build/ContainerParserTest.java +++ b/src/test/java/htsjdk/samtools/cram/build/ContainerParserTest.java @@ -54,6 +54,16 @@ private Object[][] refTestData() { add(0); }} }, + { + ContainerFactoryTest.getUnmappedRecords(), + new HashSet() {{ + add(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); + }} + }, + + // these two sets of records are "half" unplaced: they have either a valid reference index or start position, + // but not both. We treat these weird edge cases as unplaced. + { ContainerFactoryTest.getUnmappedNoRefRecords(), new HashSet() {{ diff --git a/src/test/java/htsjdk/samtools/cram/structure/CramCompressionRecordTest.java b/src/test/java/htsjdk/samtools/cram/structure/CramCompressionRecordTest.java index de366a828d..b212bbd756 100644 --- a/src/test/java/htsjdk/samtools/cram/structure/CramCompressionRecordTest.java +++ b/src/test/java/htsjdk/samtools/cram/structure/CramCompressionRecordTest.java @@ -4,85 +4,189 @@ import htsjdk.samtools.SAMRecord; import htsjdk.samtools.cram.encoding.readfeatures.*; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.ArrayList; +import java.util.List; /** * Created by vadim on 28/09/2015. */ public class CramCompressionRecordTest extends HtsjdkTest { - @Test - public void test_getAlignmentEnd() { - CramCompressionRecord r = new CramCompressionRecord(); - r.alignmentStart = 1; - r.setSegmentUnmapped(true); - Assert.assertEquals(r.getAlignmentEnd(), SAMRecord.NO_ALIGNMENT_START); + @DataProvider(name = "deltaEncodingTrueFalse") + private Object[][] tf() { + return new Object[][] { {true}, {false}}; + } - r = new CramCompressionRecord(); + @Test(dataProvider = "deltaEncodingTrueFalse") + public void test_getAlignmentSpanAndEnd(final boolean usePositionDeltaEncoding) { + CramCompressionRecord r = new CramCompressionRecord(); int readLength = 100; - r.alignmentStart = 1; + r.alignmentStart = 5; r.readLength = readLength; r.setSegmentUnmapped(false); - Assert.assertEquals(r.getAlignmentEnd(), r.readLength + r.alignmentStart - 1); + + int expectedSpan = r.readLength; + assertSpanAndEnd(r, usePositionDeltaEncoding, expectedSpan); r = new CramCompressionRecord(); - r.alignmentStart = 1; + r.alignmentStart = 10; r.readLength = readLength; r.setSegmentUnmapped(false); r.readFeatures = new ArrayList<>(); String softClip = "AAA"; r.readFeatures.add(new SoftClip(1, softClip.getBytes())); - Assert.assertEquals(r.getAlignmentEnd(), r.readLength + r.alignmentStart - 1 - softClip.length()); + + expectedSpan = r.readLength - softClip.length(); + assertSpanAndEnd(r, usePositionDeltaEncoding, expectedSpan); r = new CramCompressionRecord(); - r.alignmentStart = 1; + r.alignmentStart = 20; r.readLength = readLength; r.setSegmentUnmapped(false); r.readFeatures = new ArrayList<>(); int deletionLength = 5; r.readFeatures.add(new Deletion(1, deletionLength)); - Assert.assertEquals(r.getAlignmentEnd(), r.readLength + r.alignmentStart - 1 + deletionLength); + + expectedSpan = r.readLength + deletionLength; + assertSpanAndEnd(r, usePositionDeltaEncoding, expectedSpan); r = new CramCompressionRecord(); - r.alignmentStart = 1; + r.alignmentStart = 30; r.readLength = readLength; r.setSegmentUnmapped(false); r.readFeatures = new ArrayList<>(); String insertion = "CCCCCCCCCC"; r.readFeatures.add(new Insertion(1, insertion.getBytes())); - Assert.assertEquals(r.getAlignmentEnd(), r.readLength + r.alignmentStart - 1 - insertion.length()); + expectedSpan = r.readLength - insertion.length(); + assertSpanAndEnd(r, usePositionDeltaEncoding, expectedSpan); r = new CramCompressionRecord(); - r.alignmentStart = 1; + r.alignmentStart = 40; r.readLength = readLength; r.setSegmentUnmapped(false); r.readFeatures = new ArrayList<>(); r.readFeatures.add(new InsertBase(1, (byte) 'A')); - Assert.assertEquals(r.getAlignmentEnd(), r.readLength + r.alignmentStart - 1 - 1); + + expectedSpan = r.readLength - 1; + assertSpanAndEnd(r, usePositionDeltaEncoding, expectedSpan); + } + + private void assertSpanAndEnd(final CramCompressionRecord r, + final boolean usePositionDeltaEncoding, + final int expectedSpan) { + final int expectedEnd = expectedSpan + r.alignmentStart - 1; + Assert.assertEquals(r.getAlignmentSpan(usePositionDeltaEncoding), expectedSpan); + Assert.assertEquals(r.getAlignmentEnd(usePositionDeltaEncoding), expectedEnd); } - @Test - public void test_isPlaced() { + // https://github.com/samtools/htsjdk/issues/1301 + + // demonstrate the bug: alignmentEnd and alignmentSpan are set once only + // and do not update to reflect the state of the record + + @Test(dataProvider = "deltaEncodingTrueFalse") + public void demonstrateBug1301(final boolean usePositionDeltaEncoding) { final CramCompressionRecord r = new CramCompressionRecord(); + final int alignmentStart = 5; + final int readLength = 100; + r.alignmentStart = alignmentStart; + r.readLength = readLength; + r.setSegmentUnmapped(false); + Assert.assertEquals(r.getAlignmentEnd(usePositionDeltaEncoding), readLength + alignmentStart - 1); + Assert.assertEquals(r.getAlignmentSpan(usePositionDeltaEncoding), readLength); - // it's only Placed if both of these are valid + // matches original values, not the new ones - r.sequenceId = 5; - r.alignmentStart = 10; - Assert.assertTrue(r.isPlaced()); + r.alignmentStart++; + Assert.assertEquals(r.getAlignmentEnd(usePositionDeltaEncoding), readLength + alignmentStart - 1); + Assert.assertEquals(r.getAlignmentSpan(usePositionDeltaEncoding), readLength); + Assert.assertNotEquals(r.getAlignmentEnd(usePositionDeltaEncoding), readLength + r.alignmentStart - 1); - r.sequenceId = SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; - Assert.assertFalse(r.isPlaced()); + r.readLength++; + Assert.assertEquals(r.getAlignmentEnd(usePositionDeltaEncoding), readLength + alignmentStart - 1); + Assert.assertEquals(r.getAlignmentSpan(usePositionDeltaEncoding), readLength); + Assert.assertNotEquals(r.getAlignmentSpan(usePositionDeltaEncoding), r.readLength); + } + + @DataProvider(name = "placedTests") + private Object[][] placedTests() { + final List retval = new ArrayList<>(); + + final int validSeqId = 0; + final int[] sequenceIds = new int[]{ SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, validSeqId }; + final int validAlignmentStart = 1; + final int[] starts = new int[]{ SAMRecord.NO_ALIGNMENT_START, validAlignmentStart }; + final boolean[] deltas = new boolean[] { true, false }; + final boolean[] mappeds = new boolean[] { true, false }; + + for (final int sequenceId : sequenceIds) { + for (final int start : starts) { + for (final boolean delta : deltas) { + for (final boolean mapped : mappeds) { + + // logically, unplaced reads should never be mapped. + // when isPlaced() sees an unplaced-mapped read, it returns false and emits a log warning. + // it does not affect expectations here. + + boolean placementExpectation = true; + + // we also expect that read sequenceIds and alignmentStart are both valid or both invalid. + // however: we do handle the edge case where only one of the pair is valid + // by marking it as unplaced. + + if (sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + placementExpectation = false; + } - r.alignmentStart = SAMRecord.NO_ALIGNMENT_START; - Assert.assertFalse(r.isPlaced()); + if (!delta && start == SAMRecord.NO_ALIGNMENT_START) { + placementExpectation = false; + } - r.sequenceId = 3; - Assert.assertFalse(r.isPlaced()); + retval.add(new Object[]{sequenceId, start, mapped, delta, placementExpectation}); + } + } + } + } + + return retval.toArray(new Object[0][0]); + } + + @Test(dataProvider = "placedTests") + public void test_isPlaced(final int sequenceId, + final int alignmentStart, + final boolean mapped, + final boolean usePositionDeltaEncoding, + final boolean placementExpectation) { + final CramCompressionRecord r = new CramCompressionRecord(); + r.sequenceId = sequenceId; + r.alignmentStart = alignmentStart; + r.setSegmentUnmapped(!mapped); + Assert.assertEquals(r.isPlaced(usePositionDeltaEncoding), placementExpectation); + } + + @Test(dataProvider = "placedTests") + public void test_placementForAlignmentSpanAndEnd(final int sequenceId, + final int alignmentStart, + final boolean mapped, + final boolean usePositionDeltaEncoding, + final boolean placementExpectation) { + final CramCompressionRecord r = new CramCompressionRecord(); + r.sequenceId = sequenceId; + r.alignmentStart = alignmentStart; + r.setSegmentUnmapped(!mapped); + final int readLength = 100; + r.readLength = readLength; - r.alignmentStart = 15; - Assert.assertTrue(r.isPlaced()); + if (placementExpectation) { + Assert.assertEquals(r.getAlignmentSpan(usePositionDeltaEncoding), readLength); + Assert.assertEquals(r.getAlignmentEnd(usePositionDeltaEncoding), alignmentStart + readLength - 1); + } + else { + Assert.assertEquals(r.getAlignmentSpan(usePositionDeltaEncoding), Slice.NO_ALIGNMENT_SPAN); + Assert.assertEquals(r.getAlignmentEnd(usePositionDeltaEncoding), Slice.NO_ALIGNMENT_END); + } } } diff --git a/src/test/java/htsjdk/samtools/cram/structure/SliceTests.java b/src/test/java/htsjdk/samtools/cram/structure/SliceTests.java index 301835ae5e..c47c538385 100644 --- a/src/test/java/htsjdk/samtools/cram/structure/SliceTests.java +++ b/src/test/java/htsjdk/samtools/cram/structure/SliceTests.java @@ -9,6 +9,7 @@ import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.util.SequenceUtil; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; @@ -22,6 +23,8 @@ * Created by vadim on 07/12/2015. */ public class SliceTests extends HtsjdkTest { + private final int TEST_RECORD_COUNT = 10; + @Test public void testUnmappedValidateRef() { Slice slice = new Slice(); @@ -72,238 +75,217 @@ public void testFailsMD5Check() throws IOException { } } - @Test - public void testSingleRefBuild() { - final List records = new ArrayList<>(); - for (int i = 0; i < 10; i++) { - final CramCompressionRecord record = new CramCompressionRecord(); - record.readBases = "AAA".getBytes(); - record.qualityScores = "!!!".getBytes(); - record.readLength = 3; - record.readName = "" + i; - record.sequenceId = 0; - record.alignmentStart = i + 1; - record.setLastSegment(true); - record.readFeatures = Collections.emptyList(); - - if (i % 2 == 0) { - record.setSegmentUnmapped(true); - } else { - record.setSegmentUnmapped(false); - } - - records.add(record); - } - - final CompressionHeader header = new CompressionHeaderFactory().build(records, null, true); - - final Slice slice = Slice.buildSlice(records, header); - - Assert.assertEquals(slice.nofRecords, 10); - Assert.assertEquals(slice.sequenceId, 0); - Assert.assertEquals(slice.alignmentStart, 1); - Assert.assertEquals(slice.alignmentSpan, 12); + @DataProvider(name = "forBuildTests") + private Object[][] forBuildTests() { + return new Object[][] { + { + getSingleRefRecords(), + true, + TEST_RECORD_COUNT, 0, 1, 12 + }, + { + getMultiRefRecords(), + true, + TEST_RECORD_COUNT, Slice.MULTI_REFERENCE, Slice.NO_ALIGNMENT_START, Slice.NO_ALIGNMENT_SPAN + }, + { + getUnplacedRecords(), + true, + TEST_RECORD_COUNT, SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, Slice.NO_ALIGNMENT_START, Slice.NO_ALIGNMENT_SPAN + }, + + + // these two sets of records are "half" unplaced: they have either a valid reference index or start position, + // but not both. We treat these weird edge cases as unplaced. + + { + getNoRefRecords(), + true, + TEST_RECORD_COUNT, SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, Slice.NO_ALIGNMENT_START, Slice.NO_ALIGNMENT_SPAN + }, + + // show that not having a valid alignment start does nothing + // in the Coordinate-Sorted case (so it remains a Multi-Ref Slice) + // but causes the reads to be marked as Unplaced otherwise (so it becomes an Unmapped Slice) + + { + getNoStartRecords(), + true, + TEST_RECORD_COUNT, Slice.MULTI_REFERENCE, Slice.NO_ALIGNMENT_START, Slice.NO_ALIGNMENT_SPAN + }, + { + getNoStartRecords(), + false, + TEST_RECORD_COUNT, SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, Slice.NO_ALIGNMENT_START, Slice.NO_ALIGNMENT_SPAN + }, + + }; } - @Test - public void testMultiRefBuild() { - final List records = new ArrayList<>(); - for (int i = 0; i < 10; i++) { - final CramCompressionRecord record = new CramCompressionRecord(); - record.readBases = "AAA".getBytes(); - record.qualityScores = "!!!".getBytes(); - record.readLength = 3; - record.readName = "" + i; - record.sequenceId = i; - record.alignmentStart = i + 1; - record.setLastSegment(true); - record.readFeatures = Collections.emptyList(); - - if (i % 2 == 0) { - record.setSegmentUnmapped(true); - } else { - record.setSegmentUnmapped(false); - } - - records.add(record); - } - - final CompressionHeader header = new CompressionHeaderFactory().build(records, null, true); - + @Test(dataProvider = "forBuildTests") + public void testBuild(final List records, + final boolean coordinateSorted, + final int expectedRecordCount, + final int expectedSequenceId, + final int expectedAlignmentStart, + final int expectedAlignmentSpan) { + final CompressionHeader header = new CompressionHeaderFactory().build(records, null, coordinateSorted); final Slice slice = Slice.buildSlice(records, header); - - Assert.assertEquals(slice.nofRecords, 10); - Assert.assertEquals(slice.sequenceId, Slice.MULTI_REFERENCE); - Assert.assertEquals(slice.alignmentStart, Slice.NO_ALIGNMENT_START); - Assert.assertEquals(slice.alignmentSpan, Slice.NO_ALIGNMENT_SPAN); + assertSliceState(slice, expectedRecordCount, expectedSequenceId, expectedAlignmentStart, expectedAlignmentSpan); } // show that a slice with a single ref will initially be built as single-ref - // but adding an additional refs will make it multiref - // and another will keep it multiref + // but adding an additional ref will make it multiref + // and more will keep it multiref (mapped or otherwise) @Test public void testBuildStates() { final List records = new ArrayList<>(); - final CramCompressionRecord record0 = new CramCompressionRecord(); - record0.readBases = "AAA".getBytes(); - record0.qualityScores = "!!!".getBytes(); - record0.readLength = 3; - record0.readName = "1"; - record0.sequenceId = 0; - record0.alignmentStart = 1; - record0.setLastSegment(true); - record0.readFeatures = Collections.emptyList(); - record0.setSegmentUnmapped(false); - - records.add(record0); - - final CompressionHeader header0 = new CompressionHeaderFactory().build(records, null, true); - final Slice slice0 = Slice.buildSlice(records, header0); - - Assert.assertEquals(slice0.nofRecords, 1); - Assert.assertEquals(slice0.sequenceId, 0); - Assert.assertEquals(slice0.alignmentStart, 1); - Assert.assertEquals(slice0.alignmentSpan, 3); - - final CramCompressionRecord record1 = new CramCompressionRecord(); - record1.readBases = "AAA".getBytes(); - record1.qualityScores = "!!!".getBytes(); - record1.readLength = 3; - record1.readName = "1"; - record1.sequenceId = 1; - record1.alignmentStart = 1; - record1.setLastSegment(true); - record1.readFeatures = Collections.emptyList(); - record1.setSegmentUnmapped(false); - + final CramCompressionRecord record1 = createRecord(0, 0); records.add(record1); + assertSliceStateFromRecords(records, 1, 0, 1, 3); - final CompressionHeader header1 = new CompressionHeaderFactory().build(records, null, true); - final Slice slice1 = Slice.buildSlice(records, header1); + final CramCompressionRecord record2 = createRecord(1, 1); + records.add(record2); + assertSliceStateFromRecords(records, 2, Slice.MULTI_REFERENCE, Slice.NO_ALIGNMENT_START, Slice.NO_ALIGNMENT_SPAN); - Assert.assertEquals(slice1.nofRecords, 2); - Assert.assertEquals(slice1.sequenceId, Slice.MULTI_REFERENCE); - Assert.assertEquals(slice1.alignmentStart, Slice.NO_ALIGNMENT_START); - Assert.assertEquals(slice1.alignmentSpan, Slice.NO_ALIGNMENT_SPAN); + final CramCompressionRecord record3 = createRecord(2, 2); + records.add(record3); + assertSliceStateFromRecords(records, 3, Slice.MULTI_REFERENCE, Slice.NO_ALIGNMENT_START, Slice.NO_ALIGNMENT_SPAN); - final CramCompressionRecord unmapped = new CramCompressionRecord(); - unmapped.readBases = "AAA".getBytes(); - unmapped.qualityScores = "!!!".getBytes(); - unmapped.readLength = 3; - unmapped.readName = "1"; - unmapped.sequenceId = SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; + final CramCompressionRecord unmapped = createRecord(3, SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); unmapped.alignmentStart = SAMRecord.NO_ALIGNMENT_START; - unmapped.setLastSegment(true); - unmapped.readFeatures = Collections.emptyList(); unmapped.setSegmentUnmapped(true); - records.add(unmapped); - - final CompressionHeader header2 = new CompressionHeaderFactory().build(records, null, true); - final Slice slice2 = Slice.buildSlice(records, header2); - - Assert.assertEquals(slice2.nofRecords, 3); - Assert.assertEquals(slice2.sequenceId, Slice.MULTI_REFERENCE); - Assert.assertEquals(slice2.alignmentStart, Slice.NO_ALIGNMENT_START); - Assert.assertEquals(slice2.alignmentSpan, Slice.NO_ALIGNMENT_SPAN); + assertSliceStateFromRecords(records, 4, Slice.MULTI_REFERENCE, Slice.NO_ALIGNMENT_START, Slice.NO_ALIGNMENT_SPAN); } @Test - public void testUnmappedNoRefBuild() { + public void testSingleAndUnmappedBuild() { final List records = new ArrayList<>(); - for (int i = 0; i < 10; i++) { - final CramCompressionRecord record = new CramCompressionRecord(); - record.readBases = "AAA".getBytes(); - record.qualityScores = "!!!".getBytes(); - record.readLength = 3; - record.readName = "" + i; - record.sequenceId = SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; - record.alignmentStart = i + 1; - record.setLastSegment(true); - record.readFeatures = Collections.emptyList(); - record.setSegmentUnmapped(true); - records.add(record); - } + final CramCompressionRecord single = createRecord(1, 0); + records.add(single); + + final CramCompressionRecord unmapped = createRecord(1, 0); + unmapped.sequenceId = SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; + unmapped.alignmentStart = SAMRecord.NO_ALIGNMENT_START; + unmapped.setSegmentUnmapped(true); + records.add(unmapped); final CompressionHeader header = new CompressionHeaderFactory().build(records, null, true); final Slice slice = Slice.buildSlice(records, header); - - Assert.assertEquals(slice.nofRecords, 10); - Assert.assertEquals(slice.sequenceId, SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); - Assert.assertEquals(slice.alignmentStart, Slice.NO_ALIGNMENT_START); - Assert.assertEquals(slice.alignmentSpan, Slice.NO_ALIGNMENT_SPAN); + assertSliceState(slice, 2, Slice.MULTI_REFERENCE, Slice.NO_ALIGNMENT_START, Slice.NO_ALIGNMENT_SPAN); } - @Test - public void testUnmappedNoStartBuild() { - final List records = new ArrayList<>(); - for (int i = 0; i < 10; i++) { - final CramCompressionRecord record = new CramCompressionRecord(); - record.readBases = "AAA".getBytes(); - record.qualityScores = "!!!".getBytes(); - record.readLength = 3; - record.readName = "" + i; - record.sequenceId = i; - record.alignmentStart = SAMRecord.NO_ALIGNMENT_START; - record.setLastSegment(true); - record.readFeatures = Collections.emptyList(); - record.setSegmentUnmapped(true); - - records.add(record); - } - + private void assertSliceStateFromRecords(final List records, + final int expectedRecordCount, + final int expectedSequenceId, + final int expectedAlignmentStart, + final int expectedAlignmentSpan) { final CompressionHeader header = new CompressionHeaderFactory().build(records, null, true); - final Slice slice = Slice.buildSlice(records, header); + assertSliceState(slice, expectedRecordCount, expectedSequenceId, expectedAlignmentStart, expectedAlignmentSpan); + } - Assert.assertEquals(slice.nofRecords, 10); - Assert.assertEquals(slice.sequenceId, SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); - Assert.assertEquals(slice.alignmentStart, Slice.NO_ALIGNMENT_START); - Assert.assertEquals(slice.alignmentSpan, Slice.NO_ALIGNMENT_SPAN); + private void assertSliceState(final Slice slice, + final int expectedRecordCount, + final int expectedSequenceId, + final int expectedAlignmentStart, + final int expectedAlignmentSpan) { + + Assert.assertEquals(slice.nofRecords, expectedRecordCount); + Assert.assertEquals(slice.sequenceId, expectedSequenceId); + Assert.assertEquals(slice.alignmentStart, expectedAlignmentStart); + Assert.assertEquals(slice.alignmentSpan, expectedAlignmentSpan); + + if (expectedSequenceId == Slice.MULTI_REFERENCE) { + Assert.assertTrue(slice.isMultiref()); + } else if (expectedSequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + Assert.assertTrue(slice.isUnmapped()); + } else { + Assert.assertTrue(slice.isMappedSingleRef()); + } } + private CramCompressionRecord createRecord(final int index, + final int sequenceId) { + final CramCompressionRecord record = new CramCompressionRecord(); + record.readBases = "AAA".getBytes(); + record.qualityScores = "!!!".getBytes(); + record.readLength = 3; + record.readName = "" + index; + record.sequenceId = sequenceId; + record.alignmentStart = index + 1; + record.setLastSegment(true); + record.readFeatures = Collections.emptyList(); + record.setSegmentUnmapped(false); + return record; + } - @Test - public void testSingleAndUnmappedBuild() { - final List records = new ArrayList<>(); + // set half of the records created as unmapped, alternating by index - final CramCompressionRecord single = new CramCompressionRecord(); - single.readBases = "AAA".getBytes(); - single.qualityScores = "!!!".getBytes(); - single.readLength = 3; - single.readName = "1"; - single.sequenceId = 0; - single.alignmentStart = 1; - single.setLastSegment(true); - single.readFeatures = Collections.emptyList(); - single.setSegmentUnmapped(false); + private CramCompressionRecord createRecordHalfUnmapped(final int index, + final int sequenceId) { + final CramCompressionRecord record = createRecord(index, sequenceId); + if (index % 2 == 0) { + record.setSegmentUnmapped(true); + } else { + record.setSegmentUnmapped(false); + } + return record; + } - records.add(single); + private List getSingleRefRecords() { + final List records = new ArrayList<>(); + for (int index = 0; index < TEST_RECORD_COUNT; index++) { + records.add(createRecordHalfUnmapped(index, 0)); + } + return records; + } - final CramCompressionRecord unmapped = new CramCompressionRecord(); - unmapped.readBases = "AAA".getBytes(); - unmapped.qualityScores = "!!!".getBytes(); - unmapped.readLength = 3; - unmapped.readName = "unmapped"; - unmapped.sequenceId = SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; - unmapped.alignmentStart = SAMRecord.NO_ALIGNMENT_START; - unmapped.setLastSegment(true); - unmapped.readFeatures = Collections.emptyList(); - unmapped.setSegmentUnmapped(true); + private List getMultiRefRecords() { + final List records = new ArrayList<>(); + for (int index = 0; index < TEST_RECORD_COUNT; index++) { + records.add(createRecordHalfUnmapped(index, index)); + } + return records; + } - records.add(unmapped); + private List getUnplacedRecords() { + final List records = new ArrayList<>(); + for (int index = 0; index < TEST_RECORD_COUNT; index++) { + final CramCompressionRecord record = createRecord(index, SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); + record.alignmentStart = SAMRecord.NO_ALIGNMENT_START; + record.setSegmentUnmapped(true); + records.add(record); + } + return records; + } - final CompressionHeader header = new CompressionHeaderFactory().build(records, null, true); + // these two sets of records are "half" unplaced: they have either a valid reference index or start position, + // but not both. We treat these weird edge cases as unplaced. - final Slice slice = Slice.buildSlice(records, header); + private List getNoRefRecords() { + final List records = new ArrayList<>(); + for (int index = 0; index < TEST_RECORD_COUNT; index++) { + final CramCompressionRecord record = createRecord(index, SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); + record.setSegmentUnmapped(true); + records.add(record); + } + return records; + } - Assert.assertEquals(slice.nofRecords, 2); - Assert.assertEquals(slice.sequenceId, Slice.MULTI_REFERENCE); - Assert.assertEquals(slice.alignmentStart, Slice.NO_ALIGNMENT_START); - Assert.assertEquals(slice.alignmentSpan, Slice.NO_ALIGNMENT_SPAN); + private List getNoStartRecords() { + final List records = new ArrayList<>(); + for (int index = 0; index < TEST_RECORD_COUNT; index++) { + final CramCompressionRecord record = createRecord(index, index); + record.alignmentStart = SAMRecord.NO_ALIGNMENT_START; + record.setSegmentUnmapped(true); + records.add(record); + } + return records; } }