From e37586002175fc76ae5403a8f14cd7f9bec4c4b1 Mon Sep 17 00:00:00 2001
From: Tom White
Date: Wed, 30 May 2018 11:51:31 +0100
Subject: [PATCH] Use latest SBI code from htsjdk PR (https://github.com/samtools/htsjdk/pull/1138)

---
 build.gradle                                   |   4 --
 .../spark/CreateHadoopBamSplittingIndex.java   |  49 +++++++++++-------
 .../datasources/ReadsSparkSinkUnitTest.java    |   4 +-
 ...adoopBamSplittingIndexIntegrationTest.java  |  13 +++--
 .../count_reads.bam.sbi                        | Bin 0 -> 140 bytes
 .../count_reads.bam.splitting-bai              | Bin 80 -> 80 bytes
 .../count_reads_sorted.bam.sbi                 | Bin 0 -> 140 bytes
 .../count_reads_sorted.bam.splitting-bai       | Bin 80 -> 80 bytes
 8 files changed, 38 insertions(+), 32 deletions(-)
 create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex/count_reads.bam.sbi
 create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex/count_reads_sorted.bam.sbi

diff --git a/build.gradle b/build.gradle
index db369118cf1..1783b156d01 100644
--- a/build.gradle
+++ b/build.gradle
@@ -282,10 +282,6 @@ dependencies {
     compile 'org.testng:testng:' + testNGVersion //compile instead of testCompile because it is needed for test infrastructure that needs to be packaged
     compile 'org.apache.hadoop:hadoop-minicluster:' + hadoopVersion
-    compile('org.seqdoop:hadoop-bam:' + hadoopBamVersion) {
-        exclude group: 'org.apache.hadoop'
-        exclude module: 'htsjdk'
-    }
     compile files('lib/disq-0.0.1-SNAPSHOT.jar')
     compile('org.apache.hadoop:hadoop-client:' + hadoopVersion) // should be a 'provided' dependency
     compile('com.github.jsr203hadoop:jsr203hadoop:1.0.3')
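With org.seqdoop:hadoop-bam dropped, the SBI classes used by the changes below (BAMSBIIndexer, SBIIndexWriter, SBIIndex) must now resolve from an htsjdk build that contains samtools/htsjdk#1138. A minimal, hypothetical classpath check, not part of the patch (the class name SbiClasspathCheck is invented for illustration):

// Hypothetical sanity check: fails fast if the htsjdk SBI classes from
// samtools/htsjdk#1138 are not on the classpath.
public class SbiClasspathCheck {
    public static void main(String[] args) throws ClassNotFoundException {
        Class.forName("htsjdk.samtools.BAMSBIIndexer");
        Class.forName("htsjdk.samtools.SBIIndexWriter");
        Class.forName("htsjdk.samtools.SBIIndex");
        System.out.println("htsjdk SBI support is available");
    }
}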
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
index 08fa0a87d9d..3841f9ca313 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
@@ -1,6 +1,10 @@
 package org.broadinstitute.hellbender.tools.spark;
 
 import htsjdk.samtools.*;
+import htsjdk.samtools.BAMSBIIndexer;
+import htsjdk.samtools.seekablestream.SeekableFileStream;
+import htsjdk.samtools.seekablestream.SeekableStream;
+import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.broadinstitute.barclay.argparser.Argument;
@@ -14,7 +18,6 @@
 import org.broadinstitute.hellbender.utils.io.IOUtils;
 import org.broadinstitute.hellbender.utils.read.ReadConstants;
 import org.codehaus.plexus.util.FileUtils;
-import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
 import picard.cmdline.programgroups.OtherProgramGroup;
 
 import java.io.*;
@@ -70,15 +73,15 @@ public final class CreateHadoopBamSplittingIndex extends CommandLineProgram {
 
     @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
             shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
-            doc = "The BAM splitting_index file. If this is unspecified an index will be created with the same name as " +
-                    "the input file but with the additional extension " + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION,
+            doc = "The splitting index (SBI) file. If this is unspecified an index will be created with the same name as " +
+                    "the input file but with the additional extension " + SBIIndex.FILE_EXTENSION,
             optional = true)
     public File output;
 
     @Argument(fullName = SPLITTING_INDEX_GRANULARITY_LONG_NAME,
             doc = "Splitting index granularity, an entry is created in the index every this many reads.",
             optional = true)
-    public int granularity = SplittingBAMIndexer.DEFAULT_GRANULARITY;
+    public long granularity = SBIIndexWriter.DEFAULT_GRANULARITY;
 
     @Argument(fullName = CREATE_BAI_LONG_NAME,
             doc = "Set this to create a bai index at the same time as creating a splitting index",
@@ -89,7 +92,7 @@ public final class CreateHadoopBamSplittingIndex extends CommandLineProgram {
     @Override
     public Object doWork() {
         if( granularity <= 0) {
-            throw new CommandLineException.BadArgumentValue(SPLITTING_INDEX_GRANULARITY_LONG_NAME, Integer.toString(granularity), "Granularity must be > 0");
+            throw new CommandLineException.BadArgumentValue(SPLITTING_INDEX_GRANULARITY_LONG_NAME, Long.toString(granularity), "Granularity must be > 0");
         }
         final File index = getOutputFile(output, inputBam);
         if(createBai){
@@ -101,19 +104,17 @@ public Object doWork() {
         return 0;
     }
 
-    private static void createOnlySplittingIndex(final File inputBam, final File index, final int granularity) {
+    private static void createOnlySplittingIndex(final File inputBam, final File index, final long granularity) {
         assertIsBam(inputBam);
-        //createBamSplittingIndex(inputBam, getOutputFile(output, inputBam), readValidationStringency, granularity);
-        try(BufferedInputStream in = new BufferedInputStream(new FileInputStream(inputBam));
-            BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
-            SplittingBAMIndexer.index(in, out, inputBam.length(), granularity);
-
+        try(SeekableStream in = new SeekableFileStream(inputBam);
+            BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
+            BAMSBIIndexer.createIndex(in, out, granularity);
         } catch (final IOException e) {
             throw new UserException("Couldn't create splitting index", e);
         }
     }
 
-    private static void createBaiAndSplittingIndex(final File inputBam, final File index, final int granularity, final ValidationStringency readValidationStringency) {
+    private static void createBaiAndSplittingIndex(final File inputBam, final File index, final long granularity, final ValidationStringency readValidationStringency) {
         assertIsBam(inputBam);
         try(SamReader reader = SamReaderFactory.makeDefault()
                 .validationStringency(readValidationStringency)
@@ -122,14 +123,24 @@ private static void createBaiAndSplittingIndex(final File inputBam, final File i
             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
             final SAMFileHeader header = reader.getFileHeader();
             assertBamIsCoordinateSorted(header);
-            final SplittingBAMIndexer indexer = new SplittingBAMIndexer(out, granularity);
+            final SBIIndexWriter indexer = new SBIIndexWriter(out, granularity);
 
             final BAMIndexer bamIndexer = new BAMIndexer(IOUtils.replaceExtension(index, BAMIndex.BAMIndexSuffix), header);
+            BAMFileSpan lastFilePointer = null;
             for(final SAMRecord read : reader){
-                indexer.processAlignment(read);
+                BAMFileSpan filePointer = (BAMFileSpan) read.getFileSource().getFilePointer();
+                indexer.processRecord(filePointer.getFirstOffset());
                 bamIndexer.processAlignment(read);
+                lastFilePointer = filePointer;
+            }
+            long nextStart = 0;
+            if (lastFilePointer != null && !lastFilePointer.getChunks().isEmpty()) {
+                nextStart = lastFilePointer.getChunks().get(0).getChunkEnd();
+            }
+            if (nextStart == 0) {
+                nextStart = BlockCompressedFilePointerUtil.makeFilePointer(inputBam.length()); // default to file length (in case of no reads)
             }
-            indexer.finish(inputBam.length());
+            indexer.finish(nextStart, inputBam.length()); // nextStart is the start of the next record that would be added
             bamIndexer.finish();
         } catch (final IOException e) {
             throw new UserException("Couldn't create splitting index", e);
@@ -153,11 +164,11 @@ private static void assertIsBam(final File inputBam) {
 
     private static File getOutputFile(final File suggestedOutput, final File input) {
         if(suggestedOutput == null){
-            return new File(input.getPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+            return new File(input.getPath() + SBIIndex.FILE_EXTENSION);
         } else {
-            if (!suggestedOutput.getAbsolutePath().endsWith("bam" + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)){
-                logger.warn("Creating a splitting index with an extension that doesn't match "
-                        + "bam"+SplittingBAMIndexer.OUTPUT_FILE_EXTENSION + ". Output file: "+suggestedOutput);
+            if (!suggestedOutput.getAbsolutePath().endsWith("bam" + SBIIndex.FILE_EXTENSION)){
+                logger.warn("Creating a splitting index (SBI) with an extension that doesn't match "
+                        + "bam" + SBIIndex.FILE_EXTENSION + ". Output file: " + suggestedOutput);
             }
             return suggestedOutput;
         }
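For reference, a standalone sketch of the htsjdk SBI API adopted above: BAMSBIIndexer.createIndex writes an index from a seekable stream, and SBIIndex.load reads it back (as the tests below do). This assumes an htsjdk build that includes samtools/htsjdk#1138; the class name SbiExample and the granularity of 1 are illustrative only:

import htsjdk.samtools.BAMSBIIndexer;
import htsjdk.samtools.SBIIndex;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableStream;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

public class SbiExample {
    public static void main(String[] args) throws IOException {
        final File bam = new File(args[0]);
        // The index is conventionally written next to the BAM: reads.bam -> reads.bam.sbi
        final File sbi = new File(bam.getPath() + SBIIndex.FILE_EXTENSION);
        try (SeekableStream in = new SeekableFileStream(bam);
             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(sbi))) {
            // Granularity 1 records a virtual file offset for every read.
            BAMSBIIndexer.createIndex(in, out, 1);
        }
        // Load the index back and report how many entries it contains.
        final SBIIndex index = SBIIndex.load(sbi.toPath());
        System.out.println("SBI entries: " + index.size());
    }
}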
diff --git a/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java
index 8b4afbcc465..c31ce77b213 100644
--- a/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java
@@ -4,6 +4,7 @@
 import htsjdk.samtools.SAMFileHeader;
 import htsjdk.samtools.SAMRecord;
 import htsjdk.samtools.SAMRecordCoordinateComparator;
+import htsjdk.samtools.SBIIndex;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -19,7 +20,6 @@
 import org.broadinstitute.hellbender.GATKBaseTest;
 import org.broadinstitute.hellbender.utils.test.MiniClusterUtils;
 import org.broadinstitute.hellbender.utils.test.ReadTestUtils;
-import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
 import org.testng.Assert;
 import org.testng.annotations.AfterClass;
 import org.testng.annotations.BeforeClass;
@@ -155,7 +155,7 @@ private void assertSingleShardedWritingWorks(String inputBam, String referenceFi
 
         // check that a splitting bai file is created
         if (IOUtils.isBamFileName(outputPath)) {
-            //Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)));
+            //Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + SBIIndex.FILE_EXTENSION)));
         }
 
         JavaRDD<GATKRead> rddParallelReads2 = readSource.getParallelReads(outputPath, referenceFile);
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java
index 83493248668..e242f74321f 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java
@@ -1,14 +1,13 @@
 package org.broadinstitute.hellbender.tools.spark;
 
 import htsjdk.samtools.BAMIndex;
+import htsjdk.samtools.SBIIndex;
 import htsjdk.samtools.util.IOUtil;
 import org.broadinstitute.barclay.argparser.CommandLineException;
 import org.broadinstitute.hellbender.CommandLineProgramTest;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.utils.io.IOUtils;
 import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
-import org.seqdoop.hadoop_bam.SplittingBAMIndex;
-import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
 import org.testng.Assert;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
@@ -43,15 +42,15 @@ public void testCreateSplittingIndex(final File bam) throws IOException {
         assertIndexIsNotEmpty(splittingIndex);
 
         //checked in index created with
-        // java -cp target/hadoop-bam-7.4.1-SNAPSHOT-jar-with-dependencies.jar org.seqdoop.hadoop_bam.SplittingBAMIndexer 1 <input bam>
-        final File expectedSplittingIndex = new File(bam.toPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+        // ./gatk CreateHadoopBamSplittingIndex --input <input bam> --splitting-index-granularity 1
+        final File expectedSplittingIndex = new File(bam.toPath() + SBIIndex.FILE_EXTENSION);
         IOUtil.assertFilesEqual(splittingIndex, expectedSplittingIndex);
     }
 
     private static void assertIndexIsNotEmpty(final File splittingIndex) throws IOException {
         Assert.assertTrue(splittingIndex.exists());
-        final SplittingBAMIndex splittingBAMIndex = new SplittingBAMIndex(splittingIndex);
+        final SBIIndex splittingBAMIndex = SBIIndex.load(splittingIndex.toPath());
         Assert.assertTrue(splittingBAMIndex.size() > 0 );
     }
 
@@ -82,7 +81,7 @@ public void testUnspecifiedOutputProducesAdjacentIndex(final File bam) throws IO
         // we're going to write an index next to it on disk, and we don't want to write into the test resources folder
         final File bamCopy = createTempFile("copy-"+bam, ".bam");
         Files.copy(bam.toPath(), bamCopy.toPath(), StandardCopyOption.REPLACE_EXISTING);
-        final File expectedIndex = new File(bamCopy.toPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+        final File expectedIndex = new File(bamCopy.toPath() + SBIIndex.FILE_EXTENSION);
         Assert.assertFalse(expectedIndex.exists());
         final ArgumentsBuilder args = new ArgumentsBuilder().addInput(bamCopy);
         this.runCommandLine(args);
@@ -131,7 +130,7 @@ public void testCantCreateBaiForUnsortedFile(){
     }
 
     private static File getTempIndexFile() {
-        return createTempFile("index", "bam.splitting-bai");
+        return createTempFile("index", "bam" + SBIIndex.FILE_EXTENSION);
     }
 
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex/count_reads.bam.sbi b/src/test/resources/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex/count_reads.bam.sbi
new file mode 100644
index 0000000000000000000000000000000000000000..2ef51a2d33a4221663051975734023372ef776af
GIT binary patch
literal 140
zcmWG`@?_k=#6T3_fU07I(wtB_04iV%HAjpYBJl%C-(rFAkFY}MRcsKtg&jgOFiZjK
F1OSg=2DShI

literal 0
HcmV?d00001

diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex/count_reads.bam.splitting-bai b/src/test/resources/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex/count_reads.bam.splitting-bai
index 70599a9157bf47973ede324026e01f614cc9f36f..5ffb6a0b93462c119ab5a3134c3272e12f042193 100644
GIT binary patch
literal 80
xcmWG`@?>OWU|OWU|