Use latest SBI code from htsjdk PR (samtools/htsjdk#1138)
tomwhite committed Jul 26, 2018
1 parent 0bda7b8 commit e375860
Showing 8 changed files with 38 additions and 32 deletions.
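The substance of the change: Hadoop-BAM's SplittingBAMIndexer is replaced by the SBI classes from the htsjdk PR (BAMSBIIndexer, SBIIndexWriter, SBIIndex). As a rough sketch of the new one-call API, using only names that appear in this diff (the ".sbi" extension and the granularity of 4096 are illustrative assumptions, not values taken from the commit):

```java
import htsjdk.samtools.BAMSBIIndexer;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableStream;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

// Sketch: create a splitting index (SBI) for an existing BAM in one call,
// as CreateHadoopBamSplittingIndex does in the diff below.
public class CreateSbiSketch {
    public static void main(String[] args) throws IOException {
        final File bam = new File(args[0]);
        final File sbi = new File(bam.getPath() + ".sbi"); // assumed value of SBIIndex.FILE_EXTENSION
        try (SeekableStream in = new SeekableFileStream(bam);
             OutputStream out = new BufferedOutputStream(new FileOutputStream(sbi))) {
            BAMSBIIndexer.createIndex(in, out, 4096L); // one index entry every 4096 reads
        }
    }
}
```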
build.gradle (4 changes: 0 additions & 4 deletions)
@@ -282,10 +282,6 @@ dependencies {
compile 'org.testng:testng:' + testNGVersion //compile instead of testCompile because it is needed for test infrastructure that needs to be packaged
compile 'org.apache.hadoop:hadoop-minicluster:' + hadoopVersion

- compile('org.seqdoop:hadoop-bam:' + hadoopBamVersion) {
- exclude group: 'org.apache.hadoop'
- exclude module: 'htsjdk'
- }
compile files('lib/disq-0.0.1-SNAPSHOT.jar')
compile('org.apache.hadoop:hadoop-client:' + hadoopVersion) // should be a 'provided' dependency
compile('com.github.jsr203hadoop:jsr203hadoop:1.0.3')
@@ -1,6 +1,10 @@
package org.broadinstitute.hellbender.tools.spark;

import htsjdk.samtools.*;
+ import htsjdk.samtools.BAMSBIIndexer;
+ import htsjdk.samtools.seekablestream.SeekableFileStream;
+ import htsjdk.samtools.seekablestream.SeekableStream;
+ import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
@@ -14,7 +18,6 @@
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.ReadConstants;
import org.codehaus.plexus.util.FileUtils;
- import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
import picard.cmdline.programgroups.OtherProgramGroup;

import java.io.*;
@@ -70,15 +73,15 @@ public final class CreateHadoopBamSplittingIndex extends CommandLineProgram {

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc = "The BAM splitting_index file. If this is unspecified an index will be created with the same name as " +
"the input file but with the additional extension " + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION,
doc = "The splitting index (SBI) file. If this is unspecified an index will be created with the same name as " +
"the input file but with the additional extension " + SBIIndex.FILE_EXTENSION,
optional = true)
public File output;

@Argument(fullName = SPLITTING_INDEX_GRANULARITY_LONG_NAME,
doc = "Splitting index granularity, an entry is created in the index every this many reads.",
optional = true)
- public int granularity = SplittingBAMIndexer.DEFAULT_GRANULARITY;
+ public long granularity = SBIIndexWriter.DEFAULT_GRANULARITY;

@Argument(fullName = CREATE_BAI_LONG_NAME,
doc = "Set this to create a bai index at the same time as creating a splitting index",
@@ -89,7 +92,7 @@ public final class CreateHadoopBamSplittingIndex extends CommandLineProgram {
@Override
public Object doWork() {
if( granularity <= 0) {
- throw new CommandLineException.BadArgumentValue(SPLITTING_INDEX_GRANULARITY_LONG_NAME, Integer.toString(granularity), "Granularity must be > 0");
+ throw new CommandLineException.BadArgumentValue(SPLITTING_INDEX_GRANULARITY_LONG_NAME, Long.toString(granularity), "Granularity must be > 0");
}
final File index = getOutputFile(output, inputBam);
if(createBai){
@@ -101,19 +104,17 @@ public Object doWork() {
return 0;
}

- private static void createOnlySplittingIndex(final File inputBam, final File index, final int granularity) {
+ private static void createOnlySplittingIndex(final File inputBam, final File index, final long granularity) {
assertIsBam(inputBam);
- //createBamSplittingIndex(inputBam, getOutputFile(output, inputBam), readValidationStringency, granularity);
- try(BufferedInputStream in = new BufferedInputStream(new FileInputStream(inputBam));
- BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
- SplittingBAMIndexer.index(in, out, inputBam.length(), granularity);
-
+ try(SeekableStream in = new SeekableFileStream(inputBam);
+ BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
+ BAMSBIIndexer.createIndex(in, out, granularity);
} catch (final IOException e) {
throw new UserException("Couldn't create splitting index", e);
}
}

- private static void createBaiAndSplittingIndex(final File inputBam, final File index, final int granularity, final ValidationStringency readValidationStringency) {
+ private static void createBaiAndSplittingIndex(final File inputBam, final File index, final long granularity, final ValidationStringency readValidationStringency) {
assertIsBam(inputBam);
try(SamReader reader = SamReaderFactory.makeDefault()
.validationStringency(readValidationStringency)
@@ -122,14 +123,24 @@ private static void createBaiAndSplittingIndex(final File inputBam, final File i
BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
final SAMFileHeader header = reader.getFileHeader();
assertBamIsCoordinateSorted(header);
- final SplittingBAMIndexer indexer = new SplittingBAMIndexer(out, granularity);
+ final SBIIndexWriter indexer = new SBIIndexWriter(out, granularity);

final BAMIndexer bamIndexer = new BAMIndexer(IOUtils.replaceExtension(index, BAMIndex.BAMIndexSuffix), header);
+ BAMFileSpan lastFilePointer = null;
for(final SAMRecord read : reader){
- indexer.processAlignment(read);
+ BAMFileSpan filePointer = (BAMFileSpan) read.getFileSource().getFilePointer();
+ indexer.processRecord(filePointer.getFirstOffset());
bamIndexer.processAlignment(read);
+ lastFilePointer = filePointer;
}
+ long nextStart = 0;
+ if (lastFilePointer != null && !lastFilePointer.getChunks().isEmpty()) {
+ nextStart = lastFilePointer.getChunks().get(0).getChunkEnd();
+ }
+ if (nextStart == 0) {
+ nextStart = BlockCompressedFilePointerUtil.makeFilePointer(inputBam.length()); // default to file length (in case of no reads)
+ }
- indexer.finish(inputBam.length());
+ indexer.finish(nextStart, inputBam.length()); // nextStart is start of next record that would be added
bamIndexer.finish();
} catch (final IOException e) {
throw new UserException("Couldn't create splitting index", e);
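A note on the finish logic added above: SBI entries are BGZF virtual file pointers, and finish needs the pointer at which the next record would start, hence the chunk end of the last read, or a pointer at the end of the file when there are no reads. A minimal sketch of how these pointers are packed (the values are made up; makeFilePointer, getBlockAddress and getBlockOffset are standard BlockCompressedFilePointerUtil methods):

```java
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;

// A BGZF virtual file pointer packs the byte address of a compressed block
// into the upper 48 bits and the offset within the uncompressed block into
// the lower 16 bits.
public class VirtualPointerSketch {
    public static void main(String[] args) {
        final long vfp = BlockCompressedFilePointerUtil.makeFilePointer(12345L, 678);
        System.out.println(BlockCompressedFilePointerUtil.getBlockAddress(vfp)); // 12345
        System.out.println(BlockCompressedFilePointerUtil.getBlockOffset(vfp));  // 678
    }
}
```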
@@ -153,11 +164,11 @@ private static void assertIsBam(final File inputBam) {

private static File getOutputFile(final File suggestedOutput, final File input) {
if(suggestedOutput == null){
- return new File(input.getPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+ return new File(input.getPath() + SBIIndex.FILE_EXTENSION);
} else {
- if (!suggestedOutput.getAbsolutePath().endsWith("bam" + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)){
- logger.warn("Creating a splitting index with an extension that doesn't match "
- + "bam"+SplittingBAMIndexer.OUTPUT_FILE_EXTENSION + ". Output file: "+suggestedOutput);
+ if (!suggestedOutput.getAbsolutePath().endsWith("bam" + SBIIndex.FILE_EXTENSION)){
+ logger.warn("Creating a splitting index (SBI) with an extension that doesn't match "
+ + "bam"+SBIIndex.FILE_EXTENSION + ". Output file: "+suggestedOutput);
}
return suggestedOutput;
}
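Distilled from the diff above, the writer-side API is one processRecord call per read plus a closing finish. A hedged sketch driving SBIIndexWriter by hand, using only calls visible in this commit (the output name, granularity, and file length are illustrative):

```java
import htsjdk.samtools.SBIIndexWriter;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

// Sketch: write a one-entry SBI by hand, mirroring createBaiAndSplittingIndex.
public class SbiWriterSketch {
    public static void main(String[] args) throws IOException {
        final long dataFileLength = 65_536L; // hypothetical compressed BAM size in bytes
        try (OutputStream out = new BufferedOutputStream(new FileOutputStream("example.bam.sbi"))) {
            final SBIIndexWriter writer = new SBIIndexWriter(out, 1L); // one entry per record
            writer.processRecord(0L); // virtual start offset of the first record
            // finish() takes the virtual pointer where the next record would begin
            // and the length of the data file being indexed.
            writer.finish(BlockCompressedFilePointerUtil.makeFilePointer(dataFileLength), dataFileLength);
        }
    }
}
```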
@@ -4,6 +4,7 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordCoordinateComparator;
+ import htsjdk.samtools.SBIIndex;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -19,7 +20,6 @@
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.utils.test.MiniClusterUtils;
import org.broadinstitute.hellbender.utils.test.ReadTestUtils;
- import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
@@ -155,7 +155,7 @@ private void assertSingleShardedWritingWorks(String inputBam, String referenceFi

// check that a splitting bai file is created
if (IOUtils.isBamFileName(outputPath)) {
- //Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)));
+ //Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + SBIIndex.FILE_EXTENSION)));
}

JavaRDD<GATKRead> rddParallelReads2 = readSource.getParallelReads(outputPath, referenceFile);
@@ -1,14 +1,13 @@
package org.broadinstitute.hellbender.tools.spark;

import htsjdk.samtools.BAMIndex;
+ import htsjdk.samtools.SBIIndex;
import htsjdk.samtools.util.IOUtil;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
- import org.seqdoop.hadoop_bam.SplittingBAMIndex;
- import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@@ -43,15 +42,15 @@ public void testCreateSplittingIndex(final File bam) throws IOException {
assertIndexIsNotEmpty(splittingIndex);

//checked in index created with
- // java -cp target/hadoop-bam-7.4.1-SNAPSHOT-jar-with-dependencies.jar org.seqdoop.hadoop_bam.SplittingBAMIndexer 1 <filename>
- final File expectedSplittingIndex = new File(bam.toPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+ // ./gatk CreateHadoopBamSplittingIndex --input <filename> --splitting-index-granularity 1
+ final File expectedSplittingIndex = new File(bam.toPath() + SBIIndex.FILE_EXTENSION);

IOUtil.assertFilesEqual(splittingIndex, expectedSplittingIndex);
}

private static void assertIndexIsNotEmpty(final File splittingIndex) throws IOException {
Assert.assertTrue(splittingIndex.exists());
- final SplittingBAMIndex splittingBAMIndex = new SplittingBAMIndex(splittingIndex);
+ final SBIIndex splittingBAMIndex = SBIIndex.load(splittingIndex.toPath());
Assert.assertTrue(splittingBAMIndex.size() > 0 );
}
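On the read side, the replacement for new SplittingBAMIndex(file) is SBIIndex.load, as the assertion above shows. A standalone sketch of the same check (argument handling is illustrative):

```java
import htsjdk.samtools.SBIIndex;

import java.io.IOException;
import java.nio.file.Paths;

// Sketch: load an .sbi file and report how many entries it holds,
// as assertIndexIsNotEmpty does above.
public class InspectSbiSketch {
    public static void main(String[] args) throws IOException {
        final SBIIndex index = SBIIndex.load(Paths.get(args[0]));
        System.out.println("index entries: " + index.size());
    }
}
```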

@@ -82,7 +81,7 @@ public void testUnspecifiedOutputProducesAdjacentIndex(final File bam) throws IO
// we're going to write an index next to it on disk, and we don't want to write into the test resources folder
final File bamCopy = createTempFile("copy-"+bam, ".bam");
Files.copy(bam.toPath(), bamCopy.toPath(), StandardCopyOption.REPLACE_EXISTING);
- final File expectedIndex = new File(bamCopy.toPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+ final File expectedIndex = new File(bamCopy.toPath() + SBIIndex.FILE_EXTENSION);
Assert.assertFalse(expectedIndex.exists());
final ArgumentsBuilder args = new ArgumentsBuilder().addInput(bamCopy);
this.runCommandLine(args);
@@ -131,7 +130,7 @@ public void testCantCreateBaiForUnsortedFile(){
}

private static File getTempIndexFile() {
- return createTempFile("index", "bam.splitting-bai");
+ return createTempFile("index", "bam" + SBIIndex.FILE_EXTENSION);
}


4 binary files changed (contents not shown).
