From e37586002175fc76ae5403a8f14cd7f9bec4c4b1 Mon Sep 17 00:00:00 2001
From: Tom White
Date: Wed, 30 May 2018 11:51:31 +0100
Subject: [PATCH] Use latest SBI code from htsjdk PR (https://github.com/samtools/htsjdk/pull/1138)

---
 build.gradle                                   |   4 --
 .../spark/CreateHadoopBamSplittingIndex.java   |  49 +++++++++++-------
 .../datasources/ReadsSparkSinkUnitTest.java    |   4 +-
 ...adoopBamSplittingIndexIntegrationTest.java  |  13 +++--
 .../count_reads.bam.sbi                        | Bin 0 -> 140 bytes
 .../count_reads.bam.splitting-bai              | Bin 80 -> 80 bytes
 .../count_reads_sorted.bam.sbi                 | Bin 0 -> 140 bytes
 .../count_reads_sorted.bam.splitting-bai       | Bin 80 -> 80 bytes
 8 files changed, 38 insertions(+), 32 deletions(-)
 create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex/count_reads.bam.sbi
 create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex/count_reads_sorted.bam.sbi

diff --git a/build.gradle b/build.gradle
index db369118cf1..1783b156d01 100644
--- a/build.gradle
+++ b/build.gradle
@@ -282,10 +282,6 @@ dependencies {
     compile 'org.testng:testng:' + testNGVersion //compile instead of testCompile because it is needed for test infrastructure that needs to be packaged
     compile 'org.apache.hadoop:hadoop-minicluster:' + hadoopVersion
-    compile('org.seqdoop:hadoop-bam:' + hadoopBamVersion) {
-        exclude group: 'org.apache.hadoop'
-        exclude module: 'htsjdk'
-    }
     compile files('lib/disq-0.0.1-SNAPSHOT.jar')
     compile('org.apache.hadoop:hadoop-client:' + hadoopVersion) // should be a 'provided' dependency
     compile('com.github.jsr203hadoop:jsr203hadoop:1.0.3')
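With org.seqdoop:hadoop-bam dropped, the SBI classes used by the changes below (BAMSBIIndexer, SBIIndexWriter, SBIIndex) must now resolve from an htsjdk build that contains samtools/htsjdk#1138. A minimal, hypothetical classpath check, not part of the patch (the class name SbiClasspathCheck is invented for illustration):

// Hypothetical sanity check: fails fast if the htsjdk SBI classes from
// samtools/htsjdk#1138 are not on the classpath.
public class SbiClasspathCheck {
    public static void main(String[] args) throws ClassNotFoundException {
        Class.forName("htsjdk.samtools.BAMSBIIndexer");
        Class.forName("htsjdk.samtools.SBIIndexWriter");
        Class.forName("htsjdk.samtools.SBIIndex");
        System.out.println("htsjdk SBI support is available");
    }
}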
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
index 08fa0a87d9d..3841f9ca313 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
@@ -1,6 +1,10 @@
 package org.broadinstitute.hellbender.tools.spark;
 
 import htsjdk.samtools.*;
+import htsjdk.samtools.BAMSBIIndexer;
+import htsjdk.samtools.seekablestream.SeekableFileStream;
+import htsjdk.samtools.seekablestream.SeekableStream;
+import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.broadinstitute.barclay.argparser.Argument;
@@ -14,7 +18,6 @@
 import org.broadinstitute.hellbender.utils.io.IOUtils;
 import org.broadinstitute.hellbender.utils.read.ReadConstants;
 import org.codehaus.plexus.util.FileUtils;
-import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
 import picard.cmdline.programgroups.OtherProgramGroup;
 
 import java.io.*;
@@ -70,15 +73,15 @@ public final class CreateHadoopBamSplittingIndex extends CommandLineProgram {
 
     @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
             shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
-            doc = "The BAM splitting_index file. If this is unspecified an index will be created with the same name as " +
-                    "the input file but with the additional extension " + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION,
+            doc = "The splitting index (SBI) file. If this is unspecified an index will be created with the same name as " +
+                    "the input file but with the additional extension " + SBIIndex.FILE_EXTENSION,
             optional = true)
     public File output;
 
     @Argument(fullName = SPLITTING_INDEX_GRANULARITY_LONG_NAME,
             doc = "Splitting index granularity, an entry is created in the index every this many reads.",
             optional = true)
-    public int granularity = SplittingBAMIndexer.DEFAULT_GRANULARITY;
+    public long granularity = SBIIndexWriter.DEFAULT_GRANULARITY;
 
     @Argument(fullName = CREATE_BAI_LONG_NAME,
             doc = "Set this to create a bai index at the same time as creating a splitting index",
@@ -89,7 +92,7 @@ public final class CreateHadoopBamSplittingIndex extends CommandLineProgram {
     @Override
     public Object doWork() {
         if( granularity <= 0) {
-            throw new CommandLineException.BadArgumentValue(SPLITTING_INDEX_GRANULARITY_LONG_NAME, Integer.toString(granularity), "Granularity must be > 0");
+            throw new CommandLineException.BadArgumentValue(SPLITTING_INDEX_GRANULARITY_LONG_NAME, Long.toString(granularity), "Granularity must be > 0");
         }
         final File index = getOutputFile(output, inputBam);
         if(createBai){
@@ -101,19 +104,17 @@ public Object doWork() {
         return 0;
     }
 
-    private static void createOnlySplittingIndex(final File inputBam, final File index, final int granularity) {
+    private static void createOnlySplittingIndex(final File inputBam, final File index, final long granularity) {
         assertIsBam(inputBam);
-        //createBamSplittingIndex(inputBam, getOutputFile(output, inputBam), readValidationStringency, granularity);
-        try(BufferedInputStream in = new BufferedInputStream(new FileInputStream(inputBam));
-            BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
-            SplittingBAMIndexer.index(in, out, inputBam.length(), granularity);
-
+        try(SeekableStream in = new SeekableFileStream(inputBam);
+            BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
+            BAMSBIIndexer.createIndex(in, out, granularity);
         } catch (final IOException e) {
             throw new UserException("Couldn't create splitting index", e);
         }
     }
 
-    private static void createBaiAndSplittingIndex(final File inputBam, final File index, final int granularity, final ValidationStringency readValidationStringency) {
+    private static void createBaiAndSplittingIndex(final File inputBam, final File index, final long granularity, final ValidationStringency readValidationStringency) {
         assertIsBam(inputBam);
         try(SamReader reader = SamReaderFactory.makeDefault()
                 .validationStringency(readValidationStringency)
@@ -122,14 +123,24 @@ private static void createBaiAndSplittingIndex(final File inputBam, final File i
             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
             final SAMFileHeader header = reader.getFileHeader();
             assertBamIsCoordinateSorted(header);
-            final SplittingBAMIndexer indexer = new SplittingBAMIndexer(out, granularity);
+            final SBIIndexWriter indexer = new SBIIndexWriter(out, granularity);
 
             final BAMIndexer bamIndexer = new BAMIndexer(IOUtils.replaceExtension(index, BAMIndex.BAMIndexSuffix), header);
+            BAMFileSpan lastFilePointer = null;
             for(final SAMRecord read : reader){
-                indexer.processAlignment(read);
+                BAMFileSpan filePointer = (BAMFileSpan) read.getFileSource().getFilePointer();
+                indexer.processRecord(filePointer.getFirstOffset());
                 bamIndexer.processAlignment(read);
+                lastFilePointer = filePointer;
+            }
+            long nextStart = 0;
+            if (lastFilePointer != null && !lastFilePointer.getChunks().isEmpty()) {
+                nextStart = lastFilePointer.getChunks().get(0).getChunkEnd();
+            }
+            if (nextStart == 0) {
+                nextStart = BlockCompressedFilePointerUtil.makeFilePointer(inputBam.length()); // default to file length (in case of no reads)
             }
-            indexer.finish(inputBam.length());
+            indexer.finish(nextStart, inputBam.length()); // nextStart is the start of the next record that would be added
             bamIndexer.finish();
         } catch (final IOException e) {
             throw new UserException("Couldn't create splitting index", e);
@@ -153,11 +164,11 @@ private static void assertIsBam(final File inputBam) {
 
     private static File getOutputFile(final File suggestedOutput, final File input) {
         if(suggestedOutput == null){
-            return new File(input.getPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+            return new File(input.getPath() + SBIIndex.FILE_EXTENSION);
         } else {
-            if (!suggestedOutput.getAbsolutePath().endsWith("bam" + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)){
-                logger.warn("Creating a splitting index with an extension that doesn't match "
-                        + "bam"+SplittingBAMIndexer.OUTPUT_FILE_EXTENSION + ". Output file: "+suggestedOutput);
+            if (!suggestedOutput.getAbsolutePath().endsWith("bam" + SBIIndex.FILE_EXTENSION)){
+                logger.warn("Creating a splitting index (SBI) with an extension that doesn't match "
+                        + "bam" + SBIIndex.FILE_EXTENSION + ". Output file: " + suggestedOutput);
             }
             return suggestedOutput;
         }
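For reference, a standalone sketch of the htsjdk SBI API adopted above: BAMSBIIndexer.createIndex writes an index from a seekable stream, and SBIIndex.load reads it back (as the tests below do). This assumes an htsjdk build that includes samtools/htsjdk#1138; the class name SbiExample and the granularity of 1 are illustrative only:

import htsjdk.samtools.BAMSBIIndexer;
import htsjdk.samtools.SBIIndex;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableStream;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

public class SbiExample {
    public static void main(String[] args) throws IOException {
        final File bam = new File(args[0]);
        // The index is conventionally written next to the BAM: reads.bam -> reads.bam.sbi
        final File sbi = new File(bam.getPath() + SBIIndex.FILE_EXTENSION);
        try (SeekableStream in = new SeekableFileStream(bam);
             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(sbi))) {
            // Granularity 1 records a virtual file offset for every read.
            BAMSBIIndexer.createIndex(in, out, 1);
        }
        // Load the index back and report how many entries it contains.
        final SBIIndex index = SBIIndex.load(sbi.toPath());
        System.out.println("SBI entries: " + index.size());
    }
}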
diff --git a/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java
index 8b4afbcc465..c31ce77b213 100644
--- a/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java
@@ -4,6 +4,7 @@
 import htsjdk.samtools.SAMFileHeader;
 import htsjdk.samtools.SAMRecord;
 import htsjdk.samtools.SAMRecordCoordinateComparator;
+import htsjdk.samtools.SBIIndex;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -19,7 +20,6 @@
 import org.broadinstitute.hellbender.GATKBaseTest;
 import org.broadinstitute.hellbender.utils.test.MiniClusterUtils;
 import org.broadinstitute.hellbender.utils.test.ReadTestUtils;
-import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
 import org.testng.Assert;
 import org.testng.annotations.AfterClass;
 import org.testng.annotations.BeforeClass;
@@ -155,7 +155,7 @@ private void assertSingleShardedWritingWorks(String inputBam, String referenceFi
 
         // check that a splitting bai file is created
         if (IOUtils.isBamFileName(outputPath)) {
-            //Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)));
+            //Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + SBIIndex.FILE_EXTENSION)));
         }
 
         JavaRDD<GATKRead> rddParallelReads2 = readSource.getParallelReads(outputPath, referenceFile);
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java
index 83493248668..e242f74321f 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java
@@ -1,14 +1,13 @@
 package org.broadinstitute.hellbender.tools.spark;
 
 import htsjdk.samtools.BAMIndex;
+import htsjdk.samtools.SBIIndex;
 import htsjdk.samtools.util.IOUtil;
 import org.broadinstitute.barclay.argparser.CommandLineException;
 import org.broadinstitute.hellbender.CommandLineProgramTest;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.utils.io.IOUtils;
 import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
-import org.seqdoop.hadoop_bam.SplittingBAMIndex;
-import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
 import org.testng.Assert;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
@@ -43,15 +42,15 @@ public void testCreateSplittingIndex(final File bam) throws IOException {
         assertIndexIsNotEmpty(splittingIndex);
 
         //checked in index created with
-        // java -cp target/hadoop-bam-7.4.1-SNAPSHOT-jar-with-dependencies.jar org.seqdoop.hadoop_bam.SplittingBAMIndexer 1 <input bam>
-        final File expectedSplittingIndex = new File(bam.toPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+        // ./gatk CreateHadoopBamSplittingIndex --input <input bam> --splitting-index-granularity 1
+        final File expectedSplittingIndex = new File(bam.toPath() + SBIIndex.FILE_EXTENSION);
         IOUtil.assertFilesEqual(splittingIndex, expectedSplittingIndex);
     }
 
     private static void assertIndexIsNotEmpty(final File splittingIndex) throws IOException {
         Assert.assertTrue(splittingIndex.exists());
-        final SplittingBAMIndex splittingBAMIndex = new SplittingBAMIndex(splittingIndex);
+        final SBIIndex splittingBAMIndex = SBIIndex.load(splittingIndex.toPath());
         Assert.assertTrue(splittingBAMIndex.size() > 0 );
     }
 
@@ -82,7 +81,7 @@ public void testUnspecifiedOutputProducesAdjacentIndex(final File bam) throws IO
         // we're going to write an index next to it on disk, and we don't want to write into the test resources folder
         final File bamCopy = createTempFile("copy-"+bam, ".bam");
         Files.copy(bam.toPath(), bamCopy.toPath(), StandardCopyOption.REPLACE_EXISTING);
-        final File expectedIndex = new File(bamCopy.toPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+        final File expectedIndex = new File(bamCopy.toPath() + SBIIndex.FILE_EXTENSION);
         Assert.assertFalse(expectedIndex.exists());
         final ArgumentsBuilder args = new ArgumentsBuilder().addInput(bamCopy);
         this.runCommandLine(args);
@@ -131,7 +130,7 @@ public void testCantCreateBaiForUnsortedFile(){
     }
 
     private static File getTempIndexFile() {
-        return createTempFile("index", "bam.splitting-bai");
+        return createTempFile("index", "bam" + SBIIndex.FILE_EXTENSION);
     }
 
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex/count_reads.bam.sbi b/src/test/resources/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex/count_reads.bam.sbi
new file mode 100644
index 0000000000000000000000000000000000000000..2ef51a2d33a4221663051975734023372ef776af
GIT binary patch
literal 140
zcmWG`@?_k=#6T3_fU07I(wtB_04iV%HAjpYBJl%C-(rFAkFY}MRcsKtg&jgOFiZjK
F1OSg=2DShI

literal 0
HcmV?d00001

diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex/count_reads.bam.splitting-bai b/src/test/resources/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex/count_reads.bam.splitting-bai
index 70599a9157bf47973ede324026e01f614cc9f36f..5ffb6a0b93462c119ab5a3134c3272e12f042193 100644
GIT binary patch
literal 80
xcmWG`@?>OWU|OWU|