Use latest SBI code from htsjdk PR (samtools/htsjdk#1138)
tomwhite committed Jul 26, 2018
1 parent 0bda7b8 commit e375860
Showing 8 changed files with 38 additions and 32 deletions.
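The substance of the change: Hadoop-BAM's SplittingBAMIndexer is replaced by the SBI classes from the htsjdk PR (BAMSBIIndexer, SBIIndexWriter, SBIIndex). As a rough sketch of the new one-call API, using only names that appear in this diff (the ".sbi" extension and the granularity of 4096 are illustrative assumptions, not values taken from the commit):

```java
import htsjdk.samtools.BAMSBIIndexer;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableStream;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

// Sketch: create a splitting index (SBI) for an existing BAM in one call,
// as CreateHadoopBamSplittingIndex does in the diff below.
public class CreateSbiSketch {
    public static void main(String[] args) throws IOException {
        final File bam = new File(args[0]);
        final File sbi = new File(bam.getPath() + ".sbi"); // assumed value of SBIIndex.FILE_EXTENSION
        try (SeekableStream in = new SeekableFileStream(bam);
             OutputStream out = new BufferedOutputStream(new FileOutputStream(sbi))) {
            BAMSBIIndexer.createIndex(in, out, 4096L); // one index entry every 4096 reads
        }
    }
}
```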
build.gradle (4 changes: 0 additions & 4 deletions)
@@ -282,10 +282,6 @@ dependencies {
compile 'org.testng:testng:' + testNGVersion //compile instead of testCompile because it is needed for test infrastructure that needs to be packaged
compile 'org.apache.hadoop:hadoop-minicluster:' + hadoopVersion

- compile('org.seqdoop:hadoop-bam:' + hadoopBamVersion) {
- exclude group: 'org.apache.hadoop'
- exclude module: 'htsjdk'
- }
compile files('lib/disq-0.0.1-SNAPSHOT.jar')
compile('org.apache.hadoop:hadoop-client:' + hadoopVersion) // should be a 'provided' dependency
compile('com.github.jsr203hadoop:jsr203hadoop:1.0.3')
@@ -1,6 +1,10 @@
package org.broadinstitute.hellbender.tools.spark;

import htsjdk.samtools.*;
+ import htsjdk.samtools.BAMSBIIndexer;
+ import htsjdk.samtools.seekablestream.SeekableFileStream;
+ import htsjdk.samtools.seekablestream.SeekableStream;
+ import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
@@ -14,7 +18,6 @@
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.ReadConstants;
import org.codehaus.plexus.util.FileUtils;
- import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
import picard.cmdline.programgroups.OtherProgramGroup;

import java.io.*;
@@ -70,15 +73,15 @@ public final class CreateHadoopBamSplittingIndex extends CommandLineProgram {

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc = "The BAM splitting_index file. If this is unspecified an index will be created with the same name as " +
"the input file but with the additional extension " + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION,
doc = "The splitting index (SBI) file. If this is unspecified an index will be created with the same name as " +
"the input file but with the additional extension " + SBIIndex.FILE_EXTENSION,
optional = true)
public File output;

@Argument(fullName = SPLITTING_INDEX_GRANULARITY_LONG_NAME,
doc = "Splitting index granularity, an entry is created in the index every this many reads.",
optional = true)
- public int granularity = SplittingBAMIndexer.DEFAULT_GRANULARITY;
+ public long granularity = SBIIndexWriter.DEFAULT_GRANULARITY;

@Argument(fullName = CREATE_BAI_LONG_NAME,
doc = "Set this to create a bai index at the same time as creating a splitting index",
@@ -89,7 +92,7 @@ public final class CreateHadoopBamSplittingIndex extends CommandLineProgram {
@Override
public Object doWork() {
if( granularity <= 0) {
- throw new CommandLineException.BadArgumentValue(SPLITTING_INDEX_GRANULARITY_LONG_NAME, Integer.toString(granularity), "Granularity must be > 0");
+ throw new CommandLineException.BadArgumentValue(SPLITTING_INDEX_GRANULARITY_LONG_NAME, Long.toString(granularity), "Granularity must be > 0");
}
final File index = getOutputFile(output, inputBam);
if(createBai){
@@ -101,19 +104,17 @@ public Object doWork() {
return 0;
}

- private static void createOnlySplittingIndex(final File inputBam, final File index, final int granularity) {
+ private static void createOnlySplittingIndex(final File inputBam, final File index, final long granularity) {
assertIsBam(inputBam);
- //createBamSplittingIndex(inputBam, getOutputFile(output, inputBam), readValidationStringency, granularity);
- try(BufferedInputStream in = new BufferedInputStream(new FileInputStream(inputBam));
- BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
- SplittingBAMIndexer.index(in, out, inputBam.length(), granularity);
-
+ try(SeekableStream in = new SeekableFileStream(inputBam);
+ BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
+ BAMSBIIndexer.createIndex(in, out, granularity);
} catch (final IOException e) {
throw new UserException("Couldn't create splitting index", e);
}
}

- private static void createBaiAndSplittingIndex(final File inputBam, final File index, final int granularity, final ValidationStringency readValidationStringency) {
+ private static void createBaiAndSplittingIndex(final File inputBam, final File index, final long granularity, final ValidationStringency readValidationStringency) {
assertIsBam(inputBam);
try(SamReader reader = SamReaderFactory.makeDefault()
.validationStringency(readValidationStringency)
@@ -122,14 +123,24 @@ private static void createBaiAndSplittingIndex(final File inputBam, final File i
BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
final SAMFileHeader header = reader.getFileHeader();
assertBamIsCoordinateSorted(header);
- final SplittingBAMIndexer indexer = new SplittingBAMIndexer(out, granularity);
+ final SBIIndexWriter indexer = new SBIIndexWriter(out, granularity);

final BAMIndexer bamIndexer = new BAMIndexer(IOUtils.replaceExtension(index, BAMIndex.BAMIndexSuffix), header);
+ BAMFileSpan lastFilePointer = null;
for(final SAMRecord read : reader){
- indexer.processAlignment(read);
+ BAMFileSpan filePointer = (BAMFileSpan) read.getFileSource().getFilePointer();
+ indexer.processRecord(filePointer.getFirstOffset());
bamIndexer.processAlignment(read);
+ lastFilePointer = filePointer;
}
+ long nextStart = 0;
+ if (lastFilePointer != null && !lastFilePointer.getChunks().isEmpty()) {
+ nextStart = lastFilePointer.getChunks().get(0).getChunkEnd();
+ }
+ if (nextStart == 0) {
+ nextStart = BlockCompressedFilePointerUtil.makeFilePointer(inputBam.length()); // default to file length (in case of no reads)
+ }
- indexer.finish(inputBam.length());
+ indexer.finish(nextStart, inputBam.length()); // nextStart is start of next record that would be added
bamIndexer.finish();
} catch (final IOException e) {
throw new UserException("Couldn't create splitting index", e);
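A note on the finish logic added above: SBI entries are BGZF virtual file pointers, and finish needs the pointer at which the next record would start, hence the chunk end of the last read, or a pointer at the end of the file when there are no reads. A minimal sketch of how these pointers are packed (the values are made up; makeFilePointer, getBlockAddress and getBlockOffset are standard BlockCompressedFilePointerUtil methods):

```java
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;

// A BGZF virtual file pointer packs the byte address of a compressed block
// into the upper 48 bits and the offset within the uncompressed block into
// the lower 16 bits.
public class VirtualPointerSketch {
    public static void main(String[] args) {
        final long vfp = BlockCompressedFilePointerUtil.makeFilePointer(12345L, 678);
        System.out.println(BlockCompressedFilePointerUtil.getBlockAddress(vfp)); // 12345
        System.out.println(BlockCompressedFilePointerUtil.getBlockOffset(vfp));  // 678
    }
}
```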
@@ -153,11 +164,11 @@ private static void assertIsBam(final File inputBam) {

private static File getOutputFile(final File suggestedOutput, final File input) {
if(suggestedOutput == null){
- return new File(input.getPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+ return new File(input.getPath() + SBIIndex.FILE_EXTENSION);
} else {
- if (!suggestedOutput.getAbsolutePath().endsWith("bam" + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)){
- logger.warn("Creating a splitting index with an extension that doesn't match "
- + "bam"+SplittingBAMIndexer.OUTPUT_FILE_EXTENSION + ". Output file: "+suggestedOutput);
+ if (!suggestedOutput.getAbsolutePath().endsWith("bam" + SBIIndex.FILE_EXTENSION)){
+ logger.warn("Creating a splitting index (SBI) with an extension that doesn't match "
+ + "bam"+SBIIndex.FILE_EXTENSION + ". Output file: "+suggestedOutput);
}
return suggestedOutput;
}
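Distilled from the diff above, the writer-side API is one processRecord call per read plus a closing finish. A hedged sketch driving SBIIndexWriter by hand, using only calls visible in this commit (the output name, granularity, and file length are illustrative):

```java
import htsjdk.samtools.SBIIndexWriter;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

// Sketch: write a one-entry SBI by hand, mirroring createBaiAndSplittingIndex.
public class SbiWriterSketch {
    public static void main(String[] args) throws IOException {
        final long dataFileLength = 65_536L; // hypothetical compressed BAM size in bytes
        try (OutputStream out = new BufferedOutputStream(new FileOutputStream("example.bam.sbi"))) {
            final SBIIndexWriter writer = new SBIIndexWriter(out, 1L); // one entry per record
            writer.processRecord(0L); // virtual start offset of the first record
            // finish() takes the virtual pointer where the next record would begin
            // and the length of the data file being indexed.
            writer.finish(BlockCompressedFilePointerUtil.makeFilePointer(dataFileLength), dataFileLength);
        }
    }
}
```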
@@ -4,6 +4,7 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordCoordinateComparator;
+ import htsjdk.samtools.SBIIndex;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -19,7 +20,6 @@
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.utils.test.MiniClusterUtils;
import org.broadinstitute.hellbender.utils.test.ReadTestUtils;
- import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
@@ -155,7 +155,7 @@ private void assertSingleShardedWritingWorks(String inputBam, String referenceFi

// check that a splitting bai file is created
if (IOUtils.isBamFileName(outputPath)) {
- //Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)));
+ //Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + SBIIndex.FILE_EXTENSION)));
}

JavaRDD<GATKRead> rddParallelReads2 = readSource.getParallelReads(outputPath, referenceFile);
@@ -1,14 +1,13 @@
package org.broadinstitute.hellbender.tools.spark;

import htsjdk.samtools.BAMIndex;
+ import htsjdk.samtools.SBIIndex;
import htsjdk.samtools.util.IOUtil;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
- import org.seqdoop.hadoop_bam.SplittingBAMIndex;
- import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@@ -43,15 +42,15 @@ public void testCreateSplittingIndex(final File bam) throws IOException {
assertIndexIsNotEmpty(splittingIndex);

//checked in index created with
- // java -cp target/hadoop-bam-7.4.1-SNAPSHOT-jar-with-dependencies.jar org.seqdoop.hadoop_bam.SplittingBAMIndexer 1 <filename>
- final File expectedSplittingIndex = new File(bam.toPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+ // ./gatk CreateHadoopBamSplittingIndex --input <filename> --splitting-index-granularity 1
+ final File expectedSplittingIndex = new File(bam.toPath() + SBIIndex.FILE_EXTENSION);

IOUtil.assertFilesEqual(splittingIndex, expectedSplittingIndex);
}

private static void assertIndexIsNotEmpty(final File splittingIndex) throws IOException {
Assert.assertTrue(splittingIndex.exists());
- final SplittingBAMIndex splittingBAMIndex = new SplittingBAMIndex(splittingIndex);
+ final SBIIndex splittingBAMIndex = SBIIndex.load(splittingIndex.toPath());
Assert.assertTrue(splittingBAMIndex.size() > 0 );
}
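On the read side, the replacement for new SplittingBAMIndex(file) is SBIIndex.load, as the assertion above shows. A standalone sketch of the same check (argument handling is illustrative):

```java
import htsjdk.samtools.SBIIndex;

import java.io.IOException;
import java.nio.file.Paths;

// Sketch: load an .sbi file and report how many entries it holds,
// as assertIndexIsNotEmpty does above.
public class InspectSbiSketch {
    public static void main(String[] args) throws IOException {
        final SBIIndex index = SBIIndex.load(Paths.get(args[0]));
        System.out.println("index entries: " + index.size());
    }
}
```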

@@ -82,7 +81,7 @@ public void testUnspecifiedOutputProducesAdjacentIndex(final File bam) throws IO
// we're going to write an index next to it on disk, and we don't want to write into the test resources folder
final File bamCopy = createTempFile("copy-"+bam, ".bam");
Files.copy(bam.toPath(), bamCopy.toPath(), StandardCopyOption.REPLACE_EXISTING);
- final File expectedIndex = new File(bamCopy.toPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+ final File expectedIndex = new File(bamCopy.toPath() + SBIIndex.FILE_EXTENSION);
Assert.assertFalse(expectedIndex.exists());
final ArgumentsBuilder args = new ArgumentsBuilder().addInput(bamCopy);
this.runCommandLine(args);
@@ -131,7 +130,7 @@ public void testCantCreateBaiForUnsortedFile(){
}

private static File getTempIndexFile() {
- return createTempFile("index", "bam.splitting-bai");
+ return createTempFile("index", "bam" + SBIIndex.FILE_EXTENSION);
}


4 binary files changed (contents not shown).
