Skip to content

Commit

Permalink
Add support for reading and writing splitting BAM index files.
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite committed Jun 4, 2018
1 parent b2bfb32 commit 75e30e3
Show file tree
Hide file tree
Showing 7 changed files with 531 additions and 0 deletions.
6 changes: 6 additions & 0 deletions src/main/java/htsjdk/samtools/BAMFileReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,12 @@ static long findVirtualOffsetOfFirstRecord(final File bam) throws IOException {
return offset;
}

/** Reads through the header and sequence records to find the virtual file offset of the first record in the BAM file. */
static long findVirtualOffsetOfFirstRecord(final SeekableStream seekableStream) throws IOException {
final BAMFileReader reader = new BAMFileReader(seekableStream, (SeekableStream) null, false, false, ValidationStringency.SILENT, new DefaultSAMRecordFactory());
return reader.mFirstRecordPointer;
}

/**
* If true, writes the source of every read into the source SAMRecords.
* @param enabled true to write source information into each SAMRecord.
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/htsjdk/samtools/SAMUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
*/
package htsjdk.samtools;

import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.CigarUtil;
import htsjdk.samtools.util.CloserUtil;
Expand Down Expand Up @@ -685,6 +686,18 @@ public static long findVirtualOffsetOfFirstRecordInBam(final File bamFile) {
}
}

/**
* Returns the virtual file offset of the first record in a BAM file - i.e. the virtual file
* offset after skipping over the text header and the sequence records.
*/
public static long findVirtualOffsetOfFirstRecordInBam(final SeekableStream seekableStream) {
try {
return BAMFileReader.findVirtualOffsetOfFirstRecord(seekableStream);
} catch (final IOException ioe) {
throw new RuntimeEOFException(ioe);
}
}

/**
* Given a Cigar, Returns blocks of the sequence that have been aligned directly to the
* reference sequence. Note that clipped portions, and inserted and deleted bases (vs. the reference)
Expand Down
213 changes: 213 additions & 0 deletions src/main/java/htsjdk/samtools/SplittingBAMIndex.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
package htsjdk.samtools;

import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
import htsjdk.samtools.util.RuntimeEOFException;
import htsjdk.samtools.util.RuntimeIOException;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.NavigableSet;
import java.util.TreeSet;

/**
* An index into BAM files, which records the file position of the start of every <i>n</i>th record.
* Reads files that are created by {@link SplittingBAMIndexer}.
*
* <p>Indexes the positions of individual BAM records in the file.</p>
*/
public final class SplittingBAMIndex {
public static final String FILE_EXTENSION = ".sbi";

/**
* Splitting BAM index file magic number.
*/
static final byte[] SPLITTING_BAM_INDEX_MAGIC = "SBI\1".getBytes();

private final int granularity;
private final NavigableSet<Long> virtualOffsets = new TreeSet<>();

/**
* Create an in-memory splitting BAM index with the given virtual offsets.
* @param virtualOffsets the offsets in the index
*/
public SplittingBAMIndex(final int granularity, final NavigableSet<Long> virtualOffsets) {
this.granularity = granularity;
this.virtualOffsets.addAll(virtualOffsets);
if (virtualOffsets.isEmpty()) {
throw new RuntimeIOException("Invalid splitting BAM index: should contain at least the file size");
}
}

/**
* Load a splitting BAM index from a path.
* @param path the path to the splitting BAM index
* @throws IOException as per java IO contract
*/
public static SplittingBAMIndex load(final Path path) throws IOException {
try (InputStream in = new BufferedInputStream(Files.newInputStream(path))) {
return readIndex(in);
}
}

/**
* Load a splitting BAM index from a stream.
* @param in the stream to read the splitting BAM index from
*/
public static SplittingBAMIndex load(final InputStream in) {
return readIndex(in);
}

private static SplittingBAMIndex readIndex(final InputStream in) {
BinaryCodec binaryCodec = new BinaryCodec(in);
int granularity = readHeader(binaryCodec);
NavigableSet<Long> virtualOffsets = new TreeSet<>();
try {
long prev = -1;
while (true) {
long cur = binaryCodec.readLong();
if (prev > cur) {
throw new RuntimeIOException(String.format(
"Invalid splitting BAM index; offsets not in order: %#x > %#x",
prev, cur));
}
virtualOffsets.add(prev = cur);
}
} catch (RuntimeEOFException e) {
// signals end of index
}
return new SplittingBAMIndex(granularity, virtualOffsets);
}

private static int readHeader(BinaryCodec binaryCodec) {
final byte[] buffer = new byte[SPLITTING_BAM_INDEX_MAGIC.length];
binaryCodec.readBytes(buffer);
if (!Arrays.equals(buffer, SPLITTING_BAM_INDEX_MAGIC)) {
throw new RuntimeIOException("Invalid file header in splitting BAM index: " + new String(buffer) + " (" + Arrays.toString(buffer) + ")");
}
int granularity = binaryCodec.readInt();
return granularity;
}

/**
* Returns the granularity of the index, that is the number of alignments between subsequent entries in the index.
* @return the granularity of the index
*/
public int getGranularity() {
return granularity;
}

/**
* Returns the entries in the index.
*
* @return a set of file pointers for all the alignment offsets in the index, in ascending order. The last
* virtual file pointer is the (virtual) length of the file.
*/
public NavigableSet<Long> getVirtualOffsets() {
return new TreeSet<>(virtualOffsets);
}

/**
* Returns number of entries in the index.
*
* @return the number of virtual offsets in the index
*/
public int size() {
return virtualOffsets.size();
}

/**
* Returns size of the BAM file, in bytes.
*
* @return the length of the BAM file, in bytes
*/
public long bamSize() {
return BlockCompressedFilePointerUtil.getBlockAddress(virtualOffsets.last());
}

/**
* Split the BAM file for this index into non-overlapping chunks of roughly the given size that cover the whole
* file and that can be read independently of one another.
*
* @param splitSize the rough size of each split in bytes
* @return a list of contiguous, non-overlapping, sorted chunks that cover the whole BAM file
* @see #getChunk(long, long)
*/
public List<Chunk> split(int splitSize) {
if (splitSize <= 0) {
throw new IllegalArgumentException(String.format("Split size must be positive: %s", splitSize));
}
long fileSize = bamSize();
List<Chunk> chunks = new ArrayList<>();
for (int splitStart = 0; splitStart < fileSize; splitStart += splitSize) {
Chunk chunk = getChunk(splitStart, splitStart + splitSize);
if (chunk != null) {
chunks.add(chunk);
}
}
return chunks;
}

/**
* Return a chunk that corresponds to the given range in the BAM file. Note that the chunk does not necessarily
* completely cover the given range, however this method will map a set of contiguous, non-overlapping file ranges
* that cover the whole BAM file to a set of contiguous, non-overlapping chunks that cover the whole file.
*
* @param splitStart the start of the file range (inclusive)
* @param splitEnd the start of the file range (exclusive)
* @return a chunk whose virtual start is at the first alignment start position that is greater than or equal to the
* given split start position, and whose virtual end is at the first alignment start position that is greater than
* or equal to the given split end position, or null if the chunk would be empty.
* @see #split(int)
*/
public Chunk getChunk(long splitStart, long splitEnd) {
if (splitStart >= splitEnd) {
throw new IllegalArgumentException(String.format("Split start (%s) must be less than end (%s)", splitStart, splitEnd));
}
long fileSize = bamSize();
if (splitEnd > fileSize) {
splitEnd = fileSize;
}
long virtualSplitStart = BlockCompressedFilePointerUtil.makeFilePointer(splitStart);
long virtualSplitEnd = BlockCompressedFilePointerUtil.makeFilePointer(splitEnd);
Long virtualSplitStartAlignment = virtualOffsets.ceiling(virtualSplitStart);
Long virtualSplitEndAlignment = virtualOffsets.ceiling(virtualSplitEnd);
// neither virtualSplitStartAlignment nor virtualSplitEndAlignment should ever be null, but check anyway
if (virtualSplitStartAlignment == null) {
throw new IllegalArgumentException(String.format("No virtual offset found for virtual file pointer %s, last virtual offset %s", virtualSplitStart, virtualOffsets.last()));
}
if (virtualSplitEndAlignment == null) {
throw new IllegalArgumentException(String.format("No virtual offset found for virtual file pointer %s, last virtual offset %s", virtualSplitEnd, virtualOffsets.last()));
}
if (virtualSplitStartAlignment.longValue() == virtualSplitEndAlignment.longValue()) {
return null;
}
return new Chunk(virtualSplitStartAlignment, virtualSplitEndAlignment);
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;

SplittingBAMIndex that = (SplittingBAMIndex) o;

return virtualOffsets.equals(that.virtualOffsets);
}

@Override
public int hashCode() {
return virtualOffsets.hashCode();
}

@Override
public String toString() {
return virtualOffsets.toString();
}
}
Loading

0 comments on commit 75e30e3

Please sign in to comment.