diff --git a/src/main/java/htsjdk/samtools/cram/build/CRAMReferenceRegion.java b/src/main/java/htsjdk/samtools/cram/build/CRAMReferenceRegion.java index 5082b84418..fb44b52e52 100644 --- a/src/main/java/htsjdk/samtools/cram/build/CRAMReferenceRegion.java +++ b/src/main/java/htsjdk/samtools/cram/build/CRAMReferenceRegion.java @@ -39,6 +39,8 @@ * {@link #fetchReferenceBases(int)} or {@link #fetchReferenceBasesByRegion(int, int, int)}. It caches the bases * from the previous request, along with metadata about the (0-based) start offset, and length of the * cached bases. + * + * NOTE: this class is not thread-safe/safe for concurrent access across threads. */ public class CRAMReferenceRegion { private static final Log log = Log.getInstance(CRAMReferenceRegion.class); @@ -113,17 +115,18 @@ public void fetchReferenceBases(final int referenceIndex) { // Re-resolve the reference bases if we don't have a current region or if the region we have // doesn't span the *entire* contig requested. + final SAMSequenceRecord newSequenceRecord = getSAMSequenceRecord(referenceIndex); if ((referenceIndex != this.referenceIndex) || regionStart != 0 || - (regionLength < referenceBases.length)) { + (regionLength != newSequenceRecord.getSequenceLength())) { setCurrentSequence(referenceIndex); - referenceBases = referenceSource.getReferenceBases(sequenceRecord, true); + referenceBases = referenceSource.getReferenceBases(newSequenceRecord, true); if (referenceBases == null) { throw new IllegalArgumentException( - String.format("A reference must be supplied (reference sequence %s not found).", sequenceRecord)); + String.format("A reference must be supplied (reference sequence %s not found).", newSequenceRecord)); } regionStart = 0; - regionLength = sequenceRecord.getSequenceLength(); + regionLength = newSequenceRecord.getSequenceLength(); } } diff --git a/src/test/java/htsjdk/samtools/CRAMAllEncodingStrategiesTest.java b/src/test/java/htsjdk/samtools/CRAMAllEncodingStrategiesTest.java index 040c7367a0..c379e74594 100644 --- a/src/test/java/htsjdk/samtools/CRAMAllEncodingStrategiesTest.java +++ b/src/test/java/htsjdk/samtools/CRAMAllEncodingStrategiesTest.java @@ -10,43 +10,104 @@ import htsjdk.samtools.util.Tuple; import htsjdk.utils.SamtoolsTestUtils; import org.testng.Assert; +import org.testng.SkipException; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.*; import java.util.*; +/** + * Test roundtripping files through the GATK writer using both the default HTSJDK encoding strategy, plus a variety + * of alternative encoding strategies, in order to stress test the writer implementation. Compares the results with + * the original file, and then roundtrips the newly written CRAM through the samtools writer, validating that samtools + * can consume the HTSJDK-written files with the expected level of roundtrip fidelity (CRAMs don't always roundtrip + * with complete bit-level fidelity, i.e, samtools will resurrect NM/MD tags whether they were present in the original + * file or not unless they are specifically excluded, etc.). So in some case, you can't use full SAMRecord comparisons, + * in which case we fall back to lenient equality and restrict the comparison to read names, bases, alignment start/stop, + * and quality scores. + */ public class CRAMAllEncodingStrategiesTest extends HtsjdkTest { private static final File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools/cram"); private final CompressorCache compressorCache = new CompressorCache(); - @DataProvider(name="roundTripTestFiles") - public Object[][] roundTripTestFiles() { + @DataProvider(name="defaultStrategyRoundTripTestFiles") + public Object[][] defaultStrategyRoundTripTestFiles() { return new Object[][] { + // a test file with artificially small slices and containers to force multiple slices and containers { new File(TEST_DATA_DIR, "NA12878.20.21.1-100.100-SeqsPerSlice.500-unMapped.cram"), - new File(TEST_DATA_DIR, "human_g1k_v37.20.21.1-100.fasta") }, + new File(TEST_DATA_DIR, "human_g1k_v37.20.21.1-100.fasta"), + false, false }, + // the same file without the artificially small container constraints + { new File(TEST_DATA_DIR, "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.10m-10m100.cram"), + new File("src/test/resources/htsjdk/samtools/reference/human_g1k_v37.20.21.fasta.gz"), + false, false }, + // a test file with only unmapped reads + { new File(TEST_DATA_DIR, "NA12878.unmapped.cram"), + new File(TEST_DATA_DIR, "human_g1k_v37.20.21.1-100.fasta"), + false, false }, + // generated with samtools 1.19 from the gatk bam file CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam + { new File(TEST_DATA_DIR, "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram"), + new File("src/test/resources/htsjdk/samtools/reference/human_g1k_v37.20.21.fasta.gz"), + true, false }, + + // these tests use lenient equality to only validate read names, bases, alignment start/stop, and qual scores + + // a user-contributed file with reads aligned only to the mito contig that has been rewritten (long ago) with GATK + { new File(TEST_DATA_DIR, "mitoAlignmentStartTestGATKGen.cram"), + new File(TEST_DATA_DIR, "mitoAlignmentStartTest.fa"), true, false }, + // the original user-contributed file with reads aligned only to the mito contig + { new File(TEST_DATA_DIR, "mitoAlignmentStartTest.cram"), + new File(TEST_DATA_DIR, "mitoAlignmentStartTest.fa"), true, false }, + // files created by rewriting the htsjdk test file src/test/resources/htsjdk/samtools/cram/mitoAlignmentStartTest.cram + // using code that replicates the first read (which is aligned to position 1 of the mito contig) either + // 10,000 or 20,000 times, to create a file with 2 or 3 containers, respectively, that have reads aligned to + // position 1 of the contig + { new File(TEST_DATA_DIR, "mitoAlignmentStartTest_2_containers_aligned_to_pos_1.cram"), + new File(TEST_DATA_DIR, "mitoAlignmentStartTest.fa"), true, false }, + { new File(TEST_DATA_DIR, "mitoAlignmentStartTest_3_containers_aligned_to_pos_1.cram"), + new File(TEST_DATA_DIR, "mitoAlignmentStartTest.fa"), true, false } }; } - @Test(dataProvider = "roundTripTestFiles") - public final void testRoundTripDefaultEncodingStrategy(final File sourceFile, final File referenceFile) throws IOException { + @Test(dataProvider = "defaultStrategyRoundTripTestFiles") + public final void testRoundTripDefaultEncodingStrategy( + final File sourceFile, + final File referenceFile, + final boolean lenientEquality, + final boolean emitDetail) throws IOException { + // test the default encoding strategy final CRAMEncodingStrategy testStrategy = new CRAMEncodingStrategy(); final File tempOutCRAM = File.createTempFile("testRoundTrip", ".cram"); tempOutCRAM.deleteOnExit(); CRAMTestUtils.writeToCRAMWithEncodingStrategy(testStrategy, sourceFile, tempOutCRAM, referenceFile); - assertRoundTripFidelity(sourceFile, tempOutCRAM, referenceFile, false); - assertRoundtripFidelityWithSamtools(tempOutCRAM, referenceFile); + assertRoundTripFidelity(sourceFile, tempOutCRAM, referenceFile, lenientEquality, emitDetail); + // test interop with samtools using this encoding + assertRoundtripFidelityWithSamtools(tempOutCRAM, referenceFile, lenientEquality, emitDetail); + } + + @DataProvider(name="encodingStrategiesTestFiles") + public Object[][] encodingStrategiesTestFiles() { + return new Object[][] { + { new File(TEST_DATA_DIR, "NA12878.20.21.1-100.100-SeqsPerSlice.500-unMapped.cram"), + new File(TEST_DATA_DIR, "human_g1k_v37.20.21.1-100.fasta"), false, false }, + }; } - @Test(dataProvider = "roundTripTestFiles") - public final void testAllEncodingStrategyCombinations(final File cramSourceFile, final File referenceFile) throws IOException { + @Test(dataProvider = "encodingStrategiesTestFiles") + public final void testAllEncodingStrategyCombinations( + final File cramSourceFile, + final File referenceFile, + final boolean lenientEquality, + final boolean emitDetail) throws IOException { for (final Tuple testStrategy : getAllEncodingStrategies()) { final File tempOutCRAM = File.createTempFile("allEncodingStrategyCombinations", ".cram"); tempOutCRAM.deleteOnExit(); CRAMTestUtils.writeToCRAMWithEncodingStrategy(testStrategy.b, cramSourceFile, tempOutCRAM, referenceFile); - assertRoundTripFidelity(cramSourceFile, tempOutCRAM, referenceFile, false); - assertRoundtripFidelityWithSamtools(tempOutCRAM, referenceFile); + assertRoundTripFidelity(cramSourceFile, tempOutCRAM, referenceFile, lenientEquality, emitDetail); + // test interop with samtools using this encoding + assertRoundtripFidelityWithSamtools(tempOutCRAM, referenceFile, lenientEquality, emitDetail); } } @@ -82,6 +143,7 @@ public void assertRoundTripFidelity( final File sourceFile, final File targetCRAMFile, final File referenceFile, + final boolean lenientEquality, final boolean emitDetail) throws IOException { try (final SamReader sourceReader = SamReaderFactory.makeDefault() .referenceSequence(referenceFile) @@ -91,7 +153,15 @@ public void assertRoundTripFidelity( final SAMRecordIterator sourceIterator = sourceReader.iterator(); final SAMRecordIterator targetIterator = copyReader.getIterator(); while (sourceIterator.hasNext() && targetIterator.hasNext()) { - if (emitDetail) { + if (lenientEquality) { + final SAMRecord sourceRec = sourceIterator.next(); + final SAMRecord targetRec = targetIterator.next(); + Assert.assertEquals(targetRec.getReadName(), sourceRec.getReadName()); + Assert.assertEquals(targetRec.getAlignmentStart(), sourceRec.getAlignmentStart()); + Assert.assertEquals(targetRec.getAlignmentEnd(), sourceRec.getAlignmentEnd()); + Assert.assertEquals(targetRec.getReadBases(), sourceRec.getReadBases()); + Assert.assertEquals(targetRec.getBaseQualities(), sourceRec.getBaseQualities()); + } else if (emitDetail) { final SAMRecord sourceRec = sourceIterator.next(); final SAMRecord targetRec = targetIterator.next(); if (!sourceRec.equals(targetRec)) { @@ -108,13 +178,19 @@ public void assertRoundTripFidelity( } } - private void assertRoundtripFidelityWithSamtools(final File sourceCRAM, final File referenceFile) throws IOException { + private void assertRoundtripFidelityWithSamtools( + final File sourceCRAM, + final File referenceFile, + final boolean lenientEquality, + final boolean emitDetail) throws IOException { if (SamtoolsTestUtils.isSamtoolsAvailable()) { final File samtoolsOutFile = SamtoolsTestUtils.convertToCRAM( sourceCRAM, referenceFile, "--input-fmt-option decode_md=0 --output-fmt-option store_md=0 --output-fmt-option store_nm=0"); - assertRoundTripFidelity(sourceCRAM, samtoolsOutFile, referenceFile, false); + assertRoundTripFidelity(sourceCRAM, samtoolsOutFile, referenceFile, lenientEquality, emitDetail); + } else { + throw new SkipException("samtools is not installed, skipping test"); } } diff --git a/src/test/java/htsjdk/samtools/cram/ref/CRAMReferenceRegionTest.java b/src/test/java/htsjdk/samtools/cram/ref/CRAMReferenceRegionTest.java index 42749be2aa..b2ab52489e 100644 --- a/src/test/java/htsjdk/samtools/cram/ref/CRAMReferenceRegionTest.java +++ b/src/test/java/htsjdk/samtools/cram/ref/CRAMReferenceRegionTest.java @@ -3,6 +3,7 @@ import htsjdk.HtsjdkTest; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.cram.build.CRAMReferenceRegion; +import htsjdk.samtools.cram.structure.AlignmentContext; import htsjdk.samtools.cram.structure.CRAMStructureTestHelper; import htsjdk.samtools.reference.InMemoryReferenceSequenceFile; import org.testng.Assert; @@ -167,6 +168,44 @@ public void testGetReferenceBasesByRegionPositive( Assert.assertEquals(bases, Arrays.copyOfRange(fullContigBases, requestedOffset, requestedOffset + requestedLength)); } + // Simulate the state transitions that occur when writing a CRAM file; specifically, use transitions that mirror + // the ones that occur when writing a CRAM with the conditions that affect (and that fail without the fix to) + // https://github.com/broadinstitute/gatk/issues/8768, i.e., a container with one or more containers with reads + // aligned to position 1 of a given contig, followed by one or more containers with reads aligned past position 1 + // of the same contig. + @Test + public void testSerialStateTransitions() { + // start with an entire reference sequence + final CRAMReferenceRegion cramReferenceRegion = getAlternatingReferenceRegion(); + cramReferenceRegion.fetchReferenceBases(CRAMStructureTestHelper.REFERENCE_SEQUENCE_ZERO); + final long fullRegionFragmentLength = cramReferenceRegion.getRegionLength(); + Assert.assertEquals(fullRegionFragmentLength, CRAMStructureTestHelper.REFERENCE_CONTIG_LENGTH); + + // transition to a shorter reference fragment using fetchReferenceBasesByRegion, then back to the full region + final int SHORT_FRAGMENT_LENGTH = 5; + Assert.assertTrue(SHORT_FRAGMENT_LENGTH < fullRegionFragmentLength); + cramReferenceRegion.fetchReferenceBasesByRegion(CRAMStructureTestHelper.REFERENCE_SEQUENCE_ZERO, 0, SHORT_FRAGMENT_LENGTH); + Assert.assertEquals(cramReferenceRegion.getRegionLength(), SHORT_FRAGMENT_LENGTH); + + // now transition back to the full sequence; this is where the bug previously would have occurred + cramReferenceRegion.fetchReferenceBases(CRAMStructureTestHelper.REFERENCE_SEQUENCE_ZERO); + // this assert would fail without the fix + Assert.assertEquals(cramReferenceRegion.getRegionLength(), fullRegionFragmentLength); + + // transition to a shorter region fragment length using fetchReferenceBasesByRegion(AlignmentContext), then back to the full region + Assert.assertTrue(SHORT_FRAGMENT_LENGTH < fullRegionFragmentLength); + cramReferenceRegion.fetchReferenceBasesByRegion( + new AlignmentContext( + new ReferenceContext(CRAMStructureTestHelper.REFERENCE_SEQUENCE_ZERO), + 1, + SHORT_FRAGMENT_LENGTH)); + Assert.assertEquals(cramReferenceRegion.getRegionLength(), SHORT_FRAGMENT_LENGTH); + + // now transition back to the full sequence + cramReferenceRegion.fetchReferenceBases(CRAMStructureTestHelper.REFERENCE_SEQUENCE_ZERO); + Assert.assertEquals(cramReferenceRegion.getRegionLength(), fullRegionFragmentLength); + } + @Test public void testGetReferenceBasesByRegionExceedsContigLength() { int regionStart = CRAMStructureTestHelper.REFERENCE_CONTIG_LENGTH / 2; diff --git a/src/test/resources/htsjdk/samtools/cram/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram b/src/test/resources/htsjdk/samtools/cram/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram new file mode 100644 index 0000000000..5ee9f75662 Binary files /dev/null and b/src/test/resources/htsjdk/samtools/cram/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram differ diff --git a/src/test/resources/htsjdk/samtools/cram/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram.crai b/src/test/resources/htsjdk/samtools/cram/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram.crai new file mode 100644 index 0000000000..05fb32f0c4 Binary files /dev/null and b/src/test/resources/htsjdk/samtools/cram/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram.crai differ diff --git a/src/test/resources/htsjdk/samtools/cram/mitoAlignmentStartTest_2_containers_aligned_to_pos_1.cram b/src/test/resources/htsjdk/samtools/cram/mitoAlignmentStartTest_2_containers_aligned_to_pos_1.cram new file mode 100644 index 0000000000..d967ea1ed1 Binary files /dev/null and b/src/test/resources/htsjdk/samtools/cram/mitoAlignmentStartTest_2_containers_aligned_to_pos_1.cram differ diff --git a/src/test/resources/htsjdk/samtools/cram/mitoAlignmentStartTest_3_containers_aligned_to_pos_1.cram b/src/test/resources/htsjdk/samtools/cram/mitoAlignmentStartTest_3_containers_aligned_to_pos_1.cram new file mode 100644 index 0000000000..3eea1c1b51 Binary files /dev/null and b/src/test/resources/htsjdk/samtools/cram/mitoAlignmentStartTest_3_containers_aligned_to_pos_1.cram differ