Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for reading and writing splitting BAM index files. #1138

Merged
merged 7 commits into from
Dec 11, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/main/java/htsjdk/samtools/BAMFileReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,12 @@ static long findVirtualOffsetOfFirstRecord(final File bam) throws IOException {
return offset;
}

/**
 * Finds the virtual file offset at which the first alignment record of a BAM stream begins.
 * <p>
 * A {@link BAMFileReader} is constructed over the supplied stream and the first-record pointer it
 * captured ({@code mFirstRecordPointer}) is returned. The reader is not closed by this method.
 *
 * @param seekableStream the stream to read the BAM data from
 * @return the value of the reader's {@code mFirstRecordPointer} after construction
 * @throws IOException if constructing the reader over the stream fails
 */
static long findVirtualOffsetOfFirstRecord(final SeekableStream seekableStream) throws IOException {
    return new BAMFileReader(
            seekableStream,
            (SeekableStream) null,
            false,
            false,
            ValidationStringency.SILENT,
            new DefaultSAMRecordFactory()).mFirstRecordPointer;
}

/**
* If true, writes the source of every read into the source SAMRecords.
* @param enabled true to write source information into each SAMRecord.
Expand Down Expand Up @@ -944,6 +950,10 @@ public CloseableIterator<SAMRecord> createIndexIterator(final QueryInterval[] in
return new BAMQueryFilteringIterator(iterator, new BAMQueryMultipleIntervalsIteratorFilter(intervals, contained));
}

public long getVirtualFilePointer() {
yfarjoun marked this conversation as resolved.
Show resolved Hide resolved
return mCompressedInputStream.getFilePointer();
}

/**
* Iterate over the SAMRecords defined by the sections of the file described in the ctor argument.
*/
Expand Down
64 changes: 64 additions & 0 deletions src/main/java/htsjdk/samtools/BAMSBIIndexer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package htsjdk.samtools;
yfarjoun marked this conversation as resolved.
Show resolved Hide resolved

import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.IOUtil;

import java.io.EOFException;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Files;
import java.nio.file.Path;

/**
* Writes SBI files for BAM files, as understood by {@link SBIIndex}.
*/
public final class BAMSBIIndexer {

/**
* Perform indexing on the given BAM file, at the granularity level specified.
*
* @param bamFile the path to the BAM file
* @param granularity write the offset of every n-th alignment to the index
* @throws IOException as per java IO contract
*/
public static void createIndex(final Path bamFile, final long granularity) throws IOException {
Path splittingBaiFile = IOUtil.addExtension(bamFile, SBIIndex.FILE_EXTENSION);
try (SeekableStream in = new SeekablePathStream(bamFile); OutputStream out = Files.newOutputStream(splittingBaiFile)) {
createIndex(in, out, granularity);
}
}

/**
* Perform indexing on the given BAM file, at the granularity level specified.
*
* @param in a seekable stream for reading the BAM file from
* @param out the stream to write the index to
* @param granularity write the offset of every n-th alignment to the index
* @throws IOException as per java IO contract
*/
public static void createIndex(final SeekableStream in, final OutputStream out, final long granularity) throws IOException {
long recordStart = SAMUtils.findVirtualOffsetOfFirstRecordInBam(in);
try (BlockCompressedInputStream blockIn = new BlockCompressedInputStream(in)) {
blockIn.seek(recordStart);
final ByteBuffer byteBuffer = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN); // BAM is little-endian
SBIIndexWriter indexWriter = new SBIIndexWriter(out, granularity);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

final

while (true) {
try {
recordStart = blockIn.getFilePointer();
InputStreamUtils.readFully(blockIn, byteBuffer.array(), 0, 4);
yfarjoun marked this conversation as resolved.
Show resolved Hide resolved
final int blockSize = byteBuffer.getInt(0); // length of remainder of alignment record
indexWriter.processRecord(recordStart);
InputStreamUtils.skipFully(blockIn, blockSize);
} catch (EOFException e) {
break;
}
}
indexWriter.finish(recordStart, in.length());
}
}
}
13 changes: 13 additions & 0 deletions src/main/java/htsjdk/samtools/SAMUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
*/
package htsjdk.samtools;

import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.CigarUtil;
import htsjdk.samtools.util.CloserUtil;
Expand Down Expand Up @@ -685,6 +686,18 @@ public static long findVirtualOffsetOfFirstRecordInBam(final File bamFile) {
}
}

/**
 * Locates the first alignment record in a BAM stream and returns its virtual file offset,
 * i.e. the offset reached once the text header and the sequence records have been skipped.
 * <p>
 * Delegates to {@link BAMFileReader#findVirtualOffsetOfFirstRecord(SeekableStream)}; any
 * {@link IOException} raised there is rethrown wrapped in a {@link RuntimeEOFException}.
 *
 * @param seekableStream the stream to read the BAM data from
 * @return the virtual file offset of the first record
 */
public static long findVirtualOffsetOfFirstRecordInBam(final SeekableStream seekableStream) {
    final long firstRecordOffset;
    try {
        firstRecordOffset = BAMFileReader.findVirtualOffsetOfFirstRecord(seekableStream);
    } catch (final IOException ioe) {
        throw new RuntimeEOFException(ioe);
    }
    return firstRecordOffset;
}

/**
* Given a Cigar, Returns blocks of the sequence that have been aligned directly to the
* reference sequence. Note that clipped portions, and inserted and deleted bases (vs. the reference)
Expand Down
Loading