diff --git a/src/main/java/htsjdk/samtools/AbstractBAMFileIndex.java b/src/main/java/htsjdk/samtools/AbstractBAMFileIndex.java index 724e73c627..29f390172a 100644 --- a/src/main/java/htsjdk/samtools/AbstractBAMFileIndex.java +++ b/src/main/java/htsjdk/samtools/AbstractBAMFileIndex.java @@ -27,13 +27,6 @@ import htsjdk.samtools.util.RuntimeIOException; import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.MappedByteBuffer; -import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; @@ -55,34 +48,27 @@ public abstract class AbstractBAMFileIndex implements BAMIndex { private final IndexFileBuffer mIndexBuffer; - private SAMSequenceDictionary mBamDictionary = null; - - final int [] sequenceIndexes; - - protected AbstractBAMFileIndex( - final SeekableStream stream, final SAMSequenceDictionary dictionary) - { - mBamDictionary = dictionary; - mIndexBuffer = new IndexStreamBuffer(stream); + private final SAMSequenceDictionary mBamDictionary; - verifyBAMMagicNumber(stream.getSource()); + long[] sequenceIndexes; - sequenceIndexes = new int[readInteger() + 1]; - Arrays.fill(sequenceIndexes, -1); + protected AbstractBAMFileIndex(final SeekableStream stream, final SAMSequenceDictionary dictionary) { + this(new IndexStreamBuffer(stream), stream.getSource(), dictionary); } protected AbstractBAMFileIndex(final File file, final SAMSequenceDictionary dictionary) { - this(file, dictionary, true); + this(new MemoryMappedFileBuffer(file), file.getName(), dictionary); } protected AbstractBAMFileIndex(final File file, final SAMSequenceDictionary dictionary, final boolean useMemoryMapping) { - mBamDictionary = dictionary; - mIndexBuffer = (useMemoryMapping ? new MemoryMappedFileBuffer(file) : new RandomAccessFileBuffer(file)); - - verifyBAMMagicNumber(file.getName()); + this((useMemoryMapping ? new MemoryMappedFileBuffer(file) : new RandomAccessFileBuffer(file)), file.getName(), dictionary); + } - sequenceIndexes = new int[readInteger() + 1]; - Arrays.fill(sequenceIndexes, -1); + protected AbstractBAMFileIndex(final IndexFileBuffer indexFileBuffer, final String source, final SAMSequenceDictionary dictionary) { + mIndexBuffer = indexFileBuffer; + mBamDictionary = dictionary; + verifyIndexMagicNumber(source); + initParameters(); } /** @@ -101,12 +87,20 @@ public static int getNumIndexLevels() { return GenomicIndexUtil.LEVEL_STARTS.length; } + private static void assertLevelIsValid (final int levelNumber) { + if (levelNumber >= getNumIndexLevels()) { + throw new SAMException("Level number (" + levelNumber + ") is greater than or equal to maximum (" + getNumIndexLevels() + ")."); + } + } + /** * Gets the first bin in the given level. * @param levelNumber Level number. 0-based. * @return The first bin in this level. */ public static int getFirstBinInLevel(final int levelNumber) { + assertLevelIsValid(levelNumber); + return GenomicIndexUtil.LEVEL_STARTS[levelNumber]; } @@ -116,10 +110,13 @@ public static int getFirstBinInLevel(final int levelNumber) { * @return The size (number of possible bins) of the given level. */ public int getLevelSize(final int levelNumber) { - if(levelNumber == getNumIndexLevels()) - return GenomicIndexUtil.MAX_BINS+1-GenomicIndexUtil.LEVEL_STARTS[levelNumber]; - else - return GenomicIndexUtil.LEVEL_STARTS[levelNumber+1]-GenomicIndexUtil.LEVEL_STARTS[levelNumber]; + assertLevelIsValid(levelNumber); + + if (levelNumber == getNumIndexLevels()-1) { + return GenomicIndexUtil.MAX_BINS - GenomicIndexUtil.LEVEL_STARTS[levelNumber] - 1; + } else { + return GenomicIndexUtil.LEVEL_STARTS[levelNumber + 1] - GenomicIndexUtil.LEVEL_STARTS[levelNumber]; + } } /** @@ -227,11 +224,7 @@ public BAMIndexMetaData getMetaData(final int reference) { final int indexBin = readInteger(); final int nChunks = readInteger(); if (indexBin == GenomicIndexUtil.MAX_BINS) { - for (int ci = 0; ci < nChunks; ci++) { - final long chunkBegin = readLong(); - final long chunkEnd = readLong(); - metaDataChunks.add(new Chunk(chunkBegin, chunkEnd)); - } + readChunks(nChunks, metaDataChunks); } else { skipBytes(16 * nChunks); } @@ -287,21 +280,11 @@ protected BAMIndexContent query(final int referenceSequence, final int startPos, Chunk lastChunk = null; if (regionBins.get(indexBin)) { chunks = new ArrayList(nChunks); - for (int ci = 0; ci < nChunks; ci++) { - final long chunkBegin = readLong(); - final long chunkEnd = readLong(); - lastChunk = new Chunk(chunkBegin, chunkEnd); - chunks.add(lastChunk); - } + readChunks(nChunks, chunks); } else if (indexBin == GenomicIndexUtil.MAX_BINS) { // meta data - build the bin so that the count of bins is correct; // but don't attach meta chunks to the bin, or normal queries will be off - for (int ci = 0; ci < nChunks; ci++) { - final long chunkBegin = readLong(); - final long chunkEnd = readLong(); - lastChunk = new Chunk(chunkBegin, chunkEnd); - metaDataChunks.add(lastChunk); - } + readChunks(nChunks, metaDataChunks); metaDataSeen = true; continue; // don't create a Bin } else { @@ -366,27 +349,16 @@ protected int getMaxAddressibleGenomicLocation() { } /** + * @deprecated Use {@link GenomicIndexUtil#regionToBins(int, int)} instead. + * * Get candidate bins for the specified region * @param startPos 1-based start of target region, inclusive. * @param endPos 1-based end of target region, inclusive. * @return bit set for each bin that may contain SAMRecords in the target region. */ + @Deprecated protected BitSet regionToBins(final int startPos, final int endPos) { - final int maxPos = 0x1FFFFFFF; - final int start = (startPos <= 0) ? 0 : (startPos-1) & maxPos; - final int end = (endPos <= 0) ? maxPos : (endPos-1) & maxPos; - if (start > end) { - return null; - } - int k; - final BitSet bitSet = new BitSet(GenomicIndexUtil.MAX_BINS); - bitSet.set(0); - for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k); - for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k); - for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k); - for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k); - for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k); - return bitSet; + return GenomicIndexUtil.regionToBins(startPos, endPos); } /** @@ -397,17 +369,35 @@ protected List optimizeChunkList(final List chunks, final long min return Chunk.optimizeChunkList(chunks, minimumOffset); } - private void verifyBAMMagicNumber(final String sourceName) { + protected void verifyIndexMagicNumber(final String sourceName) { // Verify the magic number. seek(0); final byte[] buffer = new byte[4]; readBytes(buffer); - if (!Arrays.equals(buffer, BAMFileConstants.BAM_INDEX_MAGIC)) { + if (!Arrays.equals(buffer, BAMFileConstants.BAI_INDEX_MAGIC)) { throw new RuntimeIOException("Invalid file header in BAM index " + sourceName + ": " + new String(buffer)); } } + /** + * Initialization method used for simplifying the constructor + * hierarchy. + */ + protected void initParameters() { + setSequenceIndexes(getNumberOfReferences()); + } + + protected void readChunks(int nChunks, List chunks) { + Chunk lastChunk; + for (int ci = 0; ci < nChunks; ci++) { + final long chunkBegin = readLong(); + final long chunkEnd = readLong(); + lastChunk = new Chunk(chunkBegin, chunkEnd); + chunks.add(lastChunk); + } + } + private void skipToSequence(final int sequenceIndex) { //Use sequence position cache if available if(sequenceIndexes[sequenceIndex] != -1){ @@ -434,291 +424,36 @@ private void skipToSequence(final int sequenceIndex) { sequenceIndexes[sequenceIndex] = position(); } - private void readBytes(final byte[] bytes) { + protected final void readBytes(final byte[] bytes) { mIndexBuffer.readBytes(bytes); } - private int readInteger() { + protected final int readInteger() { return mIndexBuffer.readInteger(); } - private long readLong() { + protected final long readLong() { return mIndexBuffer.readLong(); } - private void skipBytes(final int count) { + protected final void skipBytes(final int count) { mIndexBuffer.skipBytes(count); } - private void seek(final int position) { + protected final void seek(final long position) { mIndexBuffer.seek(position); } - private int position(){ + protected final long position(){ return mIndexBuffer.position(); } - private abstract static class IndexFileBuffer { - abstract void readBytes(final byte[] bytes); - abstract int readInteger(); - abstract long readLong(); - abstract void skipBytes(final int count); - abstract void seek(final int position); - abstract int position(); - abstract void close(); - } - - /** - * Traditional implementation of BAM index file access using memory mapped files. - */ - private static class MemoryMappedFileBuffer extends IndexFileBuffer { - private MappedByteBuffer mFileBuffer; - - MemoryMappedFileBuffer(final File file) { - try { - // Open the file stream. - final FileInputStream fileStream = new FileInputStream(file); - final FileChannel fileChannel = fileStream.getChannel(); - mFileBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0L, fileChannel.size()); - mFileBuffer.order(ByteOrder.LITTLE_ENDIAN); - fileChannel.close(); - fileStream.close(); - } catch (final IOException exc) { - throw new RuntimeIOException(exc.getMessage(), exc); - } - } - - @Override - void readBytes(final byte[] bytes) { - mFileBuffer.get(bytes); - } - - @Override - int readInteger() { - return mFileBuffer.getInt(); - } - - @Override - long readLong() { - return mFileBuffer.getLong(); - } - - @Override - void skipBytes(final int count) { - mFileBuffer.position(mFileBuffer.position() + count); - } - - @Override - void seek(final int position) { - mFileBuffer.position(position); - } - - @Override - int position() { - return mFileBuffer.position(); - } - - @Override - void close() { - mFileBuffer = null; - } + protected final SAMSequenceDictionary getBamDictionary() { + return mBamDictionary; } - /** - * Alternative implementation of BAM index file access using regular I/O instead of memory mapping. - * - * This implementation can be more scalable for certain applications that need to access large numbers of BAM files. - * Java provides no way to explicitly release a memory mapping. Instead, you need to wait for the garbage collector - * to finalize the MappedByteBuffer. Because of this, when accessing many BAM files or when querying many BAM files - * sequentially, you cannot easily control the physical memory footprint of the java process. - * This can limit scalability and can have bad interactions with load management software like LSF, forcing you - * to reserve enough physical memory for a worst case scenario. - * The use of regular I/O allows you to trade somewhat slower performance for a small, fixed memory footprint - * if that is more suitable for your application. - */ - private static class RandomAccessFileBuffer extends IndexFileBuffer { - private static final int PAGE_SIZE = 4 * 1024; - private static final int PAGE_OFFSET_MASK = PAGE_SIZE-1; - private static final int PAGE_MASK = ~PAGE_OFFSET_MASK; - private static final int INVALID_PAGE = 1; - private final File mFile; - private RandomAccessFile mRandomAccessFile; - private final int mFileLength; - private int mFilePointer = 0; - private int mCurrentPage = INVALID_PAGE; - private final byte[] mBuffer = new byte[PAGE_SIZE]; - - RandomAccessFileBuffer(final File file) { - mFile = file; - try { - mRandomAccessFile = new RandomAccessFile(file, "r"); - final long fileLength = mRandomAccessFile.length(); - if (fileLength > Integer.MAX_VALUE) { - throw new RuntimeIOException("BAM index file " + mFile + " is too large: " + fileLength); - } - mFileLength = (int) fileLength; - } catch (final IOException exc) { - throw new RuntimeIOException(exc.getMessage(), exc); - } - } - - @Override - void readBytes(final byte[] bytes) { - int resultOffset = 0; - int resultLength = bytes.length; - if (mFilePointer + resultLength > mFileLength) { - throw new RuntimeIOException("Attempt to read past end of BAM index file (file is truncated?): " + mFile); - } - while (resultLength > 0) { - loadPage(mFilePointer); - final int pageOffset = mFilePointer & PAGE_OFFSET_MASK; - final int copyLength = Math.min(resultLength, PAGE_SIZE - pageOffset); - System.arraycopy(mBuffer, pageOffset, bytes, resultOffset, copyLength); - mFilePointer += copyLength; - resultOffset += copyLength; - resultLength -= copyLength; - } - } - - @Override - int readInteger() { - // This takes advantage of the fact that integers in BAM index files are always 4-byte aligned. - loadPage(mFilePointer); - final int pageOffset = mFilePointer & PAGE_OFFSET_MASK; - mFilePointer += 4; - return((mBuffer[pageOffset + 0] & 0xFF) | - ((mBuffer[pageOffset + 1] & 0xFF) << 8) | - ((mBuffer[pageOffset + 2] & 0xFF) << 16) | - ((mBuffer[pageOffset + 3] & 0xFF) << 24)); - } - - @Override - long readLong() { - // BAM index files are always 4-byte aligned, but not necessrily 8-byte aligned. - // So, rather than fooling with complex page logic we simply read the long in two 4-byte chunks. - final long lower = readInteger(); - final long upper = readInteger(); - return ((upper << 32) | (lower & 0xFFFFFFFFL)); - } - - @Override - void skipBytes(final int count) { - mFilePointer += count; - } - - @Override - void seek(final int position) { - mFilePointer = position; - } - - @Override - int position() { - return mFilePointer; - } - - @Override - void close() { - mFilePointer = 0; - mCurrentPage = INVALID_PAGE; - if (mRandomAccessFile != null) { - try { - mRandomAccessFile.close(); - } catch (final IOException exc) { - throw new RuntimeIOException(exc.getMessage(), exc); - } - mRandomAccessFile = null; - } - } - - private void loadPage(final int filePosition) { - final int page = filePosition & PAGE_MASK; - if (page == mCurrentPage) { - return; - } - try { - mRandomAccessFile.seek(page); - final int readLength = Math.min(mFileLength - page, PAGE_SIZE); - mRandomAccessFile.readFully(mBuffer, 0, readLength); - mCurrentPage = page; - } catch (final IOException exc) { - throw new RuntimeIOException("Exception reading BAM index file " + mFile + ": " + exc.getMessage(), exc); - } - } - } - - static class IndexStreamBuffer extends IndexFileBuffer { - private final SeekableStream in; - private final ByteBuffer tmpBuf; - - /** Continually reads from the provided {@link SeekableStream} into the buffer until the specified number of bytes are read, or - * until the stream is exhausted, throwing a {@link RuntimeIOException}. */ - private static void readFully(final SeekableStream in, final byte[] buffer, final int offset, final int length) { - int read = 0; - while (read < length) { - final int readThisLoop; - try { - readThisLoop = in.read(buffer, read, length - read); - } catch (final IOException e) { - throw new RuntimeIOException(e); - } - if (readThisLoop == -1) break; - read += readThisLoop; - } - if (read != length) throw new RuntimeIOException("Expected to read " + length + " bytes, but expired stream after " + read + "."); - } - - public IndexStreamBuffer(final SeekableStream s) { - in = s; - tmpBuf = ByteBuffer.allocate(8); // Enough to fit a long. - tmpBuf.order(ByteOrder.LITTLE_ENDIAN); - } - - @Override - public void close() { - try { in.close(); } - catch (final IOException e) { throw new RuntimeIOException(e); } - } - - @Override - public void readBytes(final byte[] bytes) { - readFully(in, bytes, 0, bytes.length); - } - - @Override - public void seek(final int position) { - try { in.seek(position); } - catch (final IOException e) { throw new RuntimeIOException(e); } - } - - @Override - public int readInteger() { - readFully(in, tmpBuf.array(), 0, 4); - return tmpBuf.getInt(0); - } - - @Override - public long readLong() { - readFully(in, tmpBuf.array(), 0, 8); - return tmpBuf.getLong(0); - } - - @Override - public void skipBytes(final int count) { - try { - for (int s = count; s > 0;) { - final int skipped = (int)in.skip(s); - if (skipped <= 0) - throw new RuntimeIOException("Failed to skip " + s); - s -= skipped; - } - } catch (final IOException e) { throw new RuntimeIOException(e); } - } - - @Override - public int position() { - try { - return (int) in.position(); - } catch (final IOException e) { throw new RuntimeIOException(e); } - } + protected final void setSequenceIndexes (int nReferences) { + sequenceIndexes = new long[nReferences + 1]; + Arrays.fill(sequenceIndexes, -1); } } diff --git a/src/main/java/htsjdk/samtools/BAMFileConstants.java b/src/main/java/htsjdk/samtools/BAMFileConstants.java index be0f36d589..638c56cc56 100644 --- a/src/main/java/htsjdk/samtools/BAMFileConstants.java +++ b/src/main/java/htsjdk/samtools/BAMFileConstants.java @@ -39,7 +39,19 @@ class BAMFileConstants { static final byte[] BAM_MAGIC = "BAM\1".getBytes(); /** - * BAM index file magic number. + * BAM index file magic numbers. + * @deprecated prefer {@link BAMFileConstants#BAI_INDEX_MAGIC} */ + @Deprecated static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes(); + static final byte[] BAI_INDEX_MAGIC = "BAI\1".getBytes(); + static final byte[] CSI_INDEX_MAGIC = "CSI\1".getBytes(); + + /** + * CSI index related constants + */ + static final int CSI_MAGIC_OFFSET = 0; + static final int CSI_MINSHIFT_OFFSET = 4; + static final int CSI_AUXDATA_OFFSET = 12; + static final int CSI_CHUNK_SIZE = 16; } diff --git a/src/main/java/htsjdk/samtools/BAMFileReader.java b/src/main/java/htsjdk/samtools/BAMFileReader.java index 9642de8807..970ea87085 100644 --- a/src/main/java/htsjdk/samtools/BAMFileReader.java +++ b/src/main/java/htsjdk/samtools/BAMFileReader.java @@ -378,6 +378,9 @@ protected void enableIndexMemoryMapping(final boolean enabled) { @Override public SamReader.Type type() { + if (mIndexFile != null && getIndexType().equals(SamIndexes.CSI)) { + return SamReader.Type.BAM_CSI_TYPE; + } return SamReader.Type.BAM_TYPE; } @@ -398,16 +401,41 @@ public BAMIndex getIndex() { if(!hasIndex()) throw new SAMException("No index is available for this BAM file."); if(mIndex == null) { - if (mIndexFile != null) - mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping) - : new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping); - else + SamIndexes samIndex = getIndexType(); + if (samIndex == null) { mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexStream, getFileHeader().getSequenceDictionary()) - : new DiskBasedBAMFileIndex(mIndexStream, getFileHeader().getSequenceDictionary()); + : new DiskBasedBAMFileIndex(mIndexStream, getFileHeader().getSequenceDictionary()); + } else if (samIndex.equals(SamIndexes.BAI)) { + mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping) + : new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping); + } else if (samIndex.equals(SamIndexes.CSI)) { + mIndex = new CSIIndex(mIndexFile, mEnableIndexMemoryMapping, getFileHeader().getSequenceDictionary()); + } else { + throw new SAMFormatException("Unsupported BAM index file: " + mIndexFile.getName()); + } } + return mIndex; } + /** + * Return the type of the BAM index, BAI or CSI. + * @return one of {@link SamIndexes#BAI} or {@link SamIndexes#CSI} or null + */ + public SamIndexes getIndexType() { + if (mIndexFile != null) { + if (mIndexFile.getName().toLowerCase().endsWith(BAMIndex.BAI_INDEX_SUFFIX)) { + return SamIndexes.BAI; + } else if (mIndexFile.getName().toLowerCase().endsWith(BAMIndex.CSI_INDEX_SUFFIX)) { + return SamIndexes.CSI; + } + + throw new SAMFormatException("Unknown BAM index file type: " + mIndexFile.getName()); + } + + return null; + } + public void setEagerDecode(final boolean desired) { this.eagerDecode = desired; } @Override diff --git a/src/main/java/htsjdk/samtools/BAMFileWriter.java b/src/main/java/htsjdk/samtools/BAMFileWriter.java index 0a77b5b89f..6af4005077 100644 --- a/src/main/java/htsjdk/samtools/BAMFileWriter.java +++ b/src/main/java/htsjdk/samtools/BAMFileWriter.java @@ -111,7 +111,7 @@ private BAMIndexer createBamIndex(final String pathURI) { try { final String indexFileBase = pathURI.endsWith(BamFileIoUtils.BAM_FILE_EXTENSION) ? pathURI.substring(0, pathURI.lastIndexOf('.')) : pathURI; - final Path indexPath = IOUtil.getPath(indexFileBase + BAMIndex.BAMIndexSuffix); + final Path indexPath = IOUtil.getPath(indexFileBase + BAMIndex.BAI_INDEX_SUFFIX); if (Files.exists(indexPath)) { if (!Files.isWritable(indexPath)) { throw new SAMException("Not creating BAM index since unable to write index file " + indexPath.toUri()); diff --git a/src/main/java/htsjdk/samtools/BAMIndex.java b/src/main/java/htsjdk/samtools/BAMIndex.java index 62c69c79c1..3e23957691 100644 --- a/src/main/java/htsjdk/samtools/BAMIndex.java +++ b/src/main/java/htsjdk/samtools/BAMIndex.java @@ -33,7 +33,13 @@ */ public interface BAMIndex extends Closeable { - public static final String BAMIndexSuffix = ".bai"; + /** + * @deprecated prefer {@link BAMIndex#BAI_INDEX_SUFFIX} instead. + */ + @Deprecated + String BAMIndexSuffix = ".bai"; + String BAI_INDEX_SUFFIX = ".bai"; + String CSI_INDEX_SUFFIX = ".csi"; /** * Gets the compressed chunks which should be searched for the contents of records contained by the span @@ -58,7 +64,7 @@ public interface BAMIndex extends Closeable { * @param reference the reference of interest * @return meta data for the reference */ - public BAMIndexMetaData getMetaData(int reference); + BAMIndexMetaData getMetaData(int reference); /** * Close the index and release any associated resources. diff --git a/src/main/java/htsjdk/samtools/BAMIndexMetaData.java b/src/main/java/htsjdk/samtools/BAMIndexMetaData.java index 3dceab21af..0efffa8cd0 100644 --- a/src/main/java/htsjdk/samtools/BAMIndexMetaData.java +++ b/src/main/java/htsjdk/samtools/BAMIndexMetaData.java @@ -212,17 +212,22 @@ long getLastOffset() { } /** - * Prints meta-data statistics from BAM index (.bai) file + * Prints meta-data statistics from BAM index (.bai or .csi) file * Statistics include count of aligned and unaligned reads for each reference sequence * and a count of all records with no start coordinate */ static public void printIndexStats(final File inputBamFile) { try { final BAMFileReader bam = new BAMFileReader(inputBamFile, null, false, false, ValidationStringency.SILENT, new DefaultSAMRecordFactory()); - if (!bam.hasIndex()) { + if (!bam.hasIndex() || bam.getIndexType() == null) { throw new SAMException("No index for bam file " + inputBamFile); } + BAMIndexMetaData[] data = getIndexStats(bam); + if (data == null) { + throw new SAMException("Exception in getting index statistics"); + } + // read through all the bins of every reference. int nRefs = bam.getFileHeader().getSequenceDictionary().size(); for (int i = 0; i < nRefs; i++) { @@ -245,7 +250,7 @@ static public void printIndexStats(final File inputBamFile) { } /** - * Prints meta-data statistics from BAM index (.bai) file + * Prints meta-data statistics from BAM index (.bai or .csi) file * Statistics include count of aligned and unaligned reads for each reference sequence * and a count of all records with no start coordinate */ diff --git a/src/main/java/htsjdk/samtools/BamFileIoUtils.java b/src/main/java/htsjdk/samtools/BamFileIoUtils.java index b5c587acc0..72cb2b5f9e 100644 --- a/src/main/java/htsjdk/samtools/BamFileIoUtils.java +++ b/src/main/java/htsjdk/samtools/BamFileIoUtils.java @@ -126,7 +126,7 @@ public static void gatherWithBlockCopying(final List bams, final File outp if (createMd5) out = new Md5CalculatingOutputStream(out, new File(output.getAbsolutePath() + ".md5")); File indexFile = null; if (createIndex) { - indexFile = new File(output.getParentFile(), IOUtil.basename(output) + BAMIndex.BAMIndexSuffix); + indexFile = new File(output.getParentFile(), IOUtil.basename(output) + BAMIndex.BAI_INDEX_SUFFIX); out = new StreamInflatingIndexingOutputStream(out, indexFile); } @@ -161,7 +161,7 @@ private static OutputStream buildOutputStream(final File outputFile, final boole outputStream = new Md5CalculatingOutputStream(outputStream, new File(outputFile.getAbsolutePath() + ".md5")); } if (createIndex) { - outputStream = new StreamInflatingIndexingOutputStream(outputStream, new File(outputFile.getParentFile(), IOUtil.basename(outputFile) + BAMIndex.BAMIndexSuffix)); + outputStream = new StreamInflatingIndexingOutputStream(outputStream, new File(outputFile.getParentFile(), IOUtil.basename(outputFile) + BAMIndex.BAI_INDEX_SUFFIX)); } return outputStream; } diff --git a/src/main/java/htsjdk/samtools/BamIndexValidator.java b/src/main/java/htsjdk/samtools/BamIndexValidator.java index fb1c8d8a27..f3b5cbe5ae 100644 --- a/src/main/java/htsjdk/samtools/BamIndexValidator.java +++ b/src/main/java/htsjdk/samtools/BamIndexValidator.java @@ -43,49 +43,73 @@ public static int exhaustivelyTestIndex(final SamReader reader) { // throws Exce // look at all chunk offsets in a linear index to make sure they are valid if (reader.indexing().hasBrowseableIndex()) { + if (SamIndexes.BAI.fileNameSuffix.endsWith(reader.type().indexExtension())) { + + // content is from an existing bai file + final CachingBAMFileIndex existingIndex = (CachingBAMFileIndex) reader.indexing().getBrowseableIndex(); // new CachingBAMFileIndex(inputBai, null); + final int numRefs = existingIndex.getNumberOfReferences(); + + int chunkCount = 0; + int indexCount = 0; + for (int i = 0; i < numRefs; i++) { + final BAMIndexContent content = existingIndex.getQueryResults(i); + for (final Chunk c : content.getAllChunks()) { + final CloseableIterator iter = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(new BAMFileSpan(c)); + chunkCount++; + SAMRecord sam = null; + try { + sam = iter.next(); + iter.close(); + } catch (final Exception e) { + throw new SAMException("Exception in BamIndexValidator. Last good record " + sam + " in chunk " + c + " chunkCount=" + chunkCount, e); + } + } + // also seek to every position in the linear index + // final BAMRecordCodec bamRecordCodec = new BAMRecordCodec(reader.getFileHeader()); + // bamRecordCodec.setInputStream(reader.getInputStream()); + + final LinearIndex linearIndex = content.getLinearIndex(); + for (final long l : linearIndex.getIndexEntries()) { + try { + if (l != 0) { + final CloseableIterator iter = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(new BAMFileSpan(new Chunk(l, l + 1))); + final SAMRecord sam = iter.next(); // read the first record identified by the linear index + indexCount++; + iter.close(); + } + } catch (final Exception e) { + throw new SAMException("Exception in BamIndexValidator. Linear index access failure " + l + " indexCount=" + indexCount, e); + } - // content is from an existing bai file - final CachingBAMFileIndex existingIndex = (CachingBAMFileIndex) reader.indexing().getBrowseableIndex(); // new CachingBAMFileIndex(inputBai, null); - final int numRefs = existingIndex.getNumberOfReferences(); - - int chunkCount = 0; - int indexCount = 0; - for (int i = 0; i < numRefs; i++) { - final BAMIndexContent content = existingIndex.getQueryResults(i); - for (final Chunk c : content.getAllChunks()) { - final CloseableIterator iter = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(new BAMFileSpan(c)); - chunkCount++; - SAMRecord sam = null; - try { - sam = iter.next(); - iter.close(); - } catch (final Exception e) { - throw new SAMException("Exception in BamIndexValidator. Last good record " + sam + " in chunk " + c + " chunkCount=" + chunkCount, e); } } - // also seek to every position in the linear index - // final BAMRecordCodec bamRecordCodec = new BAMRecordCodec(reader.getFileHeader()); - // bamRecordCodec.setInputStream(reader.getInputStream()); - - final LinearIndex linearIndex = content.getLinearIndex(); - for (final long l : linearIndex.getIndexEntries()) { - try { - if (l != 0) { - final CloseableIterator iter = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(new BAMFileSpan(new Chunk(l, l + 1))); - final SAMRecord sam = iter.next(); // read the first record identified by the linear index - indexCount++; + return chunkCount; + // System.out.println("Found " chunkCount + " chunks in test " + inputBai + + // " linearIndex positions = " + indexCount); + } else if (SamIndexes.CSI.fileNameSuffix.endsWith(reader.type().indexExtension())) { + + final CSIIndex existingIndex = (CSIIndex) reader.indexing().getBrowseableIndex(); // new CachingBAMFileIndex(inputBai, null); + final int numRefs = existingIndex.getNumberOfReferences(); + + int chunkCount = 0; + for (int i = 0; i < numRefs; i++) { + final BAMIndexContent content = existingIndex.getQueryResults(i); + for (final Chunk c : content.getAllChunks()) { + final CloseableIterator iter = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(new BAMFileSpan(c)); + chunkCount++; + SAMRecord sam = null; + try { + sam = iter.next(); iter.close(); + } catch (final Exception e) { + throw new SAMException("Exception in BamIndexValidator. Last good record " + sam + " in chunk " + c + " chunkCount=" + chunkCount, e); } - } catch (final Exception e) { - throw new SAMException("Exception in BamIndexValidator. Linear index access failure " + l + " indexCount=" + indexCount, e); } - } + return chunkCount; } - return chunkCount; - // System.out.println("Found " chunkCount + " chunks in test " + inputBai + - // " linearIndex positions = " + indexCount); - } // else not a bam file with a browseable index + } + // else not a bam file with a browseable index // System.err.println("No browseableIndex for reader"); return 0; } @@ -100,50 +124,79 @@ public static int exhaustivelyTestIndex(final SamReader reader) { // throws Exce public static int lessExhaustivelyTestIndex(final SamReader reader) { // look at all chunk offsets in a linear index to make sure they are valid if (reader.indexing().hasBrowseableIndex()) { + if (SamIndexes.BAI.fileNameSuffix.endsWith(reader.type().indexExtension())) { + + // content is from an existing bai file + final CachingBAMFileIndex existingIndex = (CachingBAMFileIndex) reader.indexing().getBrowseableIndex(); + final int numRefs = existingIndex.getNumberOfReferences(); + + int chunkCount = 0; + int indexCount = 0; + for (int i = 0; i < numRefs; i++) { + + final BAMIndexContent content = existingIndex.getQueryResults(i); + + final List chunks = content.getAllChunks(); + final int numChunks = chunks.size(); + // We are looking only at the first and last chunks + for (final int chunkNo : Arrays.asList(0, numChunks - 1)) { + chunkCount++; - // content is from an existing bai file - final CachingBAMFileIndex existingIndex = (CachingBAMFileIndex) reader.indexing().getBrowseableIndex(); - final int numRefs = existingIndex.getNumberOfReferences(); - - int chunkCount = 0; - int indexCount = 0; - for (int i = 0; i < numRefs; i++) { - - final BAMIndexContent content = existingIndex.getQueryResults(i); - - final List chunks = content.getAllChunks(); - final int numChunks = chunks.size(); - // We are looking only at the first and last chunks - for (final int chunkNo : Arrays.asList(0, numChunks - 1)) { - chunkCount++; - - final Chunk c = chunks.get(chunkNo); - final CloseableIterator iter = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(new BAMFileSpan(c)); - try { - final SAMRecord sam = iter.next(); - iter.close(); - } catch (final Exception e) { - throw new SAMException("Exception querying chunk " + chunkNo + " from reference index " + i, e); + final Chunk c = chunks.get(chunkNo); + final CloseableIterator iter = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(new BAMFileSpan(c)); + try { + final SAMRecord sam = iter.next(); + iter.close(); + } catch (final Exception e) { + throw new SAMException("Exception querying chunk " + chunkNo + " from reference index " + i, e); + } + } + + // also seek to first and last position in the linear index + final long linearIndexEntries[] = content.getLinearIndex().getIndexEntries(); + for (final int binNo : Arrays.asList(0, linearIndexEntries.length - 1)) { + indexCount++; + final long l = linearIndexEntries[binNo]; + try { + if (l != 0) { + final CloseableIterator iter = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(new BAMFileSpan(new Chunk(l, l + 1))); + final SAMRecord sam = iter.next(); // read the first record identified by the linear index + iter.close(); + } + } catch (final Exception e) { + throw new SAMException("Exception in BamIndexValidator. Linear index access failure " + l + " indexCount=" + indexCount, e); + } } } + return chunkCount; + } else if (SamIndexes.CSI.fileNameSuffix.endsWith(reader.type().indexExtension())) { + + final CSIIndex existingIndex = (CSIIndex) reader.indexing().getBrowseableIndex(); // new CachingBAMFileIndex(inputBai, null); + final int numRefs = existingIndex.getNumberOfReferences(); + + int chunkCount = 0; + for (int i = 0; i < numRefs; i++) { + + final BAMIndexContent content = existingIndex.getQueryResults(i); + + final List chunks = content.getAllChunks(); + final int numChunks = chunks.size(); + // We are looking only at the first and last chunks + for (final int chunkNo : Arrays.asList(0, numChunks - 1)) { + chunkCount++; - // also seek to first and last position in the linear index - final long linearIndexEntries[] = content.getLinearIndex().getIndexEntries(); - for (final int binNo : Arrays.asList(0, linearIndexEntries.length - 1)) { - indexCount++; - final long l = linearIndexEntries[binNo]; - try { - if (l != 0) { - final CloseableIterator iter = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(new BAMFileSpan(new Chunk(l, l + 1))); - final SAMRecord sam = iter.next(); // read the first record identified by the linear index + final Chunk c = chunks.get(chunkNo); + final CloseableIterator iter = ((SamReader.PrimitiveSamReaderToSamReaderAdapter) reader).iterator(new BAMFileSpan(c)); + try { + final SAMRecord sam = iter.next(); iter.close(); + } catch (final Exception e) { + throw new SAMException("Exception querying chunk " + chunkNo + " from reference index " + i, e); } - } catch (final Exception e) { - throw new SAMException("Exception in BamIndexValidator. Linear index access failure " + l + " indexCount=" + indexCount, e); } } + return chunkCount; } - return chunkCount; } // else it's not a bam file with a browseable index return 0; diff --git a/src/main/java/htsjdk/samtools/BinWithOffset.java b/src/main/java/htsjdk/samtools/BinWithOffset.java new file mode 100644 index 0000000000..09123889e0 --- /dev/null +++ b/src/main/java/htsjdk/samtools/BinWithOffset.java @@ -0,0 +1,21 @@ +package htsjdk.samtools; + +/** + * An individual bin of a CSI index for BAM files. + * Extends the BAI index bin {@link Bin} with a 64 bit value, + * representing the virtual file offset of the first + * overlapping record. + */ +public class BinWithOffset extends Bin { + + private final long lOffset; + + public long getlOffset() { + return lOffset; + } + + public BinWithOffset(int referenceSequence, int binNumber, long lOffset) { + super(referenceSequence, binNumber); + this.lOffset = lOffset; + } +} diff --git a/src/main/java/htsjdk/samtools/CRAMFileReader.java b/src/main/java/htsjdk/samtools/CRAMFileReader.java index 889ac4c290..70eeb3074c 100644 --- a/src/main/java/htsjdk/samtools/CRAMFileReader.java +++ b/src/main/java/htsjdk/samtools/CRAMFileReader.java @@ -277,7 +277,7 @@ public BAMIndex getIndex() { if (mIndex == null) { final SAMSequenceDictionary dictionary = getFileHeader() .getSequenceDictionary(); - if (mIndexFile.getName().endsWith(BAMIndex.BAMIndexSuffix)) { + if (mIndexFile.getName().endsWith(BAMIndex.BAI_INDEX_SUFFIX)) { mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, dictionary, mEnableIndexMemoryMapping) : new DiskBasedBAMFileIndex(mIndexFile, dictionary, diff --git a/src/main/java/htsjdk/samtools/CSIIndex.java b/src/main/java/htsjdk/samtools/CSIIndex.java new file mode 100644 index 0000000000..1931d2ddca --- /dev/null +++ b/src/main/java/htsjdk/samtools/CSIIndex.java @@ -0,0 +1,502 @@ +package htsjdk.samtools; + +import htsjdk.samtools.seekablestream.SeekablePathStream; +import htsjdk.samtools.seekablestream.SeekableStream; +import htsjdk.samtools.util.RuntimeIOException; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.*; + +/** + * Implementation of the CSI index for BAM files. + * The CSI index extends the BAI index by allowing a more flexible + * binning scheme, with variable depth (number of levels) and + * bin sizes, thus allowing for genomic regions longer than 2^29-1. + */ +public class CSIIndex extends AbstractBAMFileIndex implements BrowseableBAMIndex { + + /** + * Don't add default values to these variables, as they will override the + * value assign to them by the {@link #initParameters()} method, called from + * the superclass constructor. + */ + private int binDepth; + private int minShift; + private int maxBins; + private int maxSpan; + private byte[] auxData; + private int nReferences; + private long metaDataPos; + + /** + * Constructors + */ + + public CSIIndex(final SeekableStream stream, final SAMSequenceDictionary dictionary) { + this(new IndexStreamBuffer(stream), stream.getSource(), dictionary); + } + + public CSIIndex(final Path path, final SAMSequenceDictionary dictionary) throws IOException { + this(new SeekablePathStream(path), dictionary); + } + + public CSIIndex(final File file, boolean enableMemoryMapping, final SAMSequenceDictionary dictionary) { + this(IndexFileBufferFactory.getBuffer(file, enableMemoryMapping), file.getName(), dictionary); + } + + private CSIIndex(final IndexFileBuffer indexFileBuffer, final String source, final SAMSequenceDictionary dictionary) { + super(indexFileBuffer, source, dictionary); + } + + /** + * Getters and setters + */ + + /** + * Bin depth is the number of levels of the index. By default, + * BAI has 6 levels. CSI makes this variable. + */ + public int getBinDepth() { + return binDepth; + } + + private void setBinDepth(int binDepth) { this.binDepth = binDepth; } + + /** + * 2^(min shift) is the smallest width of a bin + */ + public int getMinShift() { + return minShift; + } + + private void setMinShift(int minShift) { + this.minShift = minShift; + } + + public int getMaxBins() { + return maxBins; + } + + private void setMaxBins(int binDepth) { this.maxBins = ((1<<3*binDepth) - 1)/7; } + + public int getMaxSpan() { + return maxSpan; + } + + private void setMaxSpan(int binDepth, int minShift) { + this.maxSpan = 1<<(minShift + 3*(binDepth - 1)); + } + + public byte[] getAuxData() { return auxData; } + + private void setAuxData(byte[] auxData) { this.auxData = auxData; } + + @Override + public int getNumberOfReferences() { return nReferences; } + + private void setNumberOfReferences(int nReferences) { this.nReferences = nReferences; } + + /** + * Computes the number of bins on the given level. + * @param levelNumber Level for which to compute the size. + * @return + */ + @Override + public int getLevelSize(final int levelNumber) { + if (levelNumber >= getBinDepth()) { + throw new SAMException("Level number (" + levelNumber + ") is greater than or equal to maximum (" + getBinDepth() + ")."); + } + return 1<<3*(levelNumber); + } + + /** + * Extends the functionality of {@link AbstractBAMFileIndex#getFirstBinInLevel(int)} , + * which cannot be overridden due to its static nature. + */ + public int getFirstBinInLevelForCSI(final int levelNumber) { + if (levelNumber >= getBinDepth()) { + throw new SAMException("Level number (" + levelNumber + ") is greater than or equal to maximum (" + getBinDepth() + ")."); + } + return ((1<<3*levelNumber) - 1)/7; + } + + @Override + public int getLevelForBin(Bin bin) { + if(bin == null || bin.getBinNumber() > getMaxBins()) { + throw new SAMException("Tried to get level for invalid bin: " + bin); + } + for (int i = getBinDepth()-1; i > -1 ; i--) { + if (bin.getBinNumber() >= getFirstBinInLevelForCSI(i)) { + return i; + } + } + throw new SAMException("Unable to find correct level for bin: " + bin); + } + + @Override + public int getFirstLocusInBin(Bin bin) { + if(bin == null || bin.getBinNumber() > getMaxBins()) { + throw new SAMException("Tried to get first locus for invalid bin: " + bin); + } + int level = getLevelForBin(bin); + int firstBinOnLevel = getFirstBinInLevelForCSI(level); + int levelSize = getLevelSize(level); + + return (bin.getBinNumber() - firstBinOnLevel)*(getMaxSpan()/levelSize) + 1; + } + + @Override + public int getLastLocusInBin(Bin bin) { + if(bin == null || bin.getBinNumber() > getMaxBins()) { + throw new SAMException("Tried to get last locus for invalid bin: " + bin); + } + int level = getLevelForBin(bin); + int firstBinOnLevel = getFirstBinInLevelForCSI(level); + int levelSize = getLevelSize(level); + + return (bin.getBinNumber() - firstBinOnLevel + 1)*(getMaxSpan()/levelSize); + } + + @Override + public BinList getBinsOverlapping(int referenceIndex, int startPos, int endPos) { + final BitSet regionBins = GenomicIndexUtil.regionToBins(startPos, endPos, getMinShift(), getBinDepth()); + if (regionBins == null) { + return null; + } + return new BinList(referenceIndex,regionBins); + } + + @Override + public BAMFileSpan getSpanOverlapping(int referenceIndex, int startPos, int endPos) { + final BAMIndexContent queryResults = query(referenceIndex, startPos, endPos); + int initialBinNumber = getFirstBinInLevelForCSI(getBinDepth() - 1) + (startPos - 1 >> getMinShift()); + long minimumOffset = 0L; + Bin targetBin; + + if(queryResults == null) { + return null; + } + + /** Compute 'minimumOffset' by searching the lowest level bin containing 'startPos'. + If the computed bin is not in the index, try the next bin to the left, belonging + to the same parent. If it is the first sibling bin, try the parent bin. + */ + + do { + int firstBinNumber; + targetBin = queryResults.getBins().getBin(initialBinNumber); + if (targetBin != null) { + break; + } + firstBinNumber = (getParentBinNumber(initialBinNumber)<<3) + 1; + if (initialBinNumber > firstBinNumber) { + initialBinNumber--; + } else { + initialBinNumber = getParentBinNumber(initialBinNumber); + } + } while(initialBinNumber != 0); + + if (initialBinNumber == 0) { + targetBin = queryResults.getBins().getBin(initialBinNumber); + } + + if (targetBin != null && targetBin instanceof BinWithOffset) { + minimumOffset = ((BinWithOffset) targetBin).getlOffset(); + } + + List chunkList = new ArrayList(); + for(final Chunk chunk: queryResults.getAllChunks()) { + chunkList.add(chunk.clone()); + } + + chunkList = Chunk.optimizeChunkList(chunkList, minimumOffset); + return new BAMFileSpan(chunkList); + } + + @Override + public BAMFileSpan getSpanOverlapping(final Bin bin) { + if(bin == null) { + return null; + } + + final int referenceSequence = bin.getReferenceSequence(); + final BAMIndexContent queryResults = getQueryResults(referenceSequence); + + if(queryResults == null) { + return null; + } + + final int binLevel = getLevelForBin(bin); + final int firstLocusInBin = getFirstLocusInBin(bin); + long minimumOffset = bin instanceof BinWithOffset ? ((BinWithOffset)bin).getlOffset() : 0L; + + // Add the specified bin to the tree if it exists. + final List binTree = new ArrayList(); + if(queryResults.containsBin(bin)) { + binTree.add(queryResults.getBins().getBin(bin.getBinNumber())); + } + + int currentBinLevel = binLevel; + while(--currentBinLevel >= 0) { + final int binStart = getFirstBinInLevelForCSI(currentBinLevel); + final int binWidth = getMaxSpan()/getLevelSize(currentBinLevel); + final int parentBinNumber = firstLocusInBin/binWidth + binStart; + final Bin parentBin = queryResults.getBins().getBin(parentBinNumber); + if(parentBin != null && queryResults.containsBin(parentBin)) { + binTree.add(parentBin); + } + } + + List chunkList = new ArrayList(); + for(final Bin coveringBin: binTree) { + for(final Chunk chunk: coveringBin.getChunkList()) + chunkList.add(chunk.clone()); + } + + chunkList = Chunk.optimizeChunkList(chunkList, minimumOffset); + return new BAMFileSpan(chunkList); + } + + @Override + public long getStartOfLastLinearBin() { + if (metaDataPos > 0 && position() != metaDataPos) { + seek(metaDataPos); + } + + final int sequenceIndex = getNumberOfReferences(); + long loffset = -1L; + for (int i = 0; i < sequenceIndex; i++) { + + final int nBins = readInteger(); // n_bin + + for (int j = 0; j < nBins; j++) { + readInteger(); // bin + loffset = readLong(); // loffset + final int nChunks = readInteger(); // n_chunk + skipBytes(BAMFileConstants.CSI_CHUNK_SIZE * nChunks); + } + } + + return loffset; + } + + @Override + protected void verifyIndexMagicNumber(final String sourceName) { + // Verify the magic number. + if (BAMFileConstants.CSI_MAGIC_OFFSET != position()) { + seek(BAMFileConstants.CSI_MAGIC_OFFSET); + } + final byte[] buffer = new byte[BAMFileConstants.CSI_MINSHIFT_OFFSET]; + readBytes(buffer); // magic + if (!Arrays.equals(buffer, BAMFileConstants.CSI_INDEX_MAGIC)) { + throw new RuntimeIOException("Invalid file header in BAM CSI index " + sourceName + + ": " + new String(buffer)); + } + } + + private void readMinShiftAndBinDepth() { + if (BAMFileConstants.CSI_MINSHIFT_OFFSET != position()) { + seek(BAMFileConstants.CSI_MINSHIFT_OFFSET); + } + setMinShift(readInteger()); // min_shift + setBinDepth(readInteger() + 1); // depth - HTSlib doesn't count the first level (bin 0) + setMaxBins(binDepth); + setMaxSpan(binDepth, minShift); + } + + private void readAuxDataAndNRef() { + if (BAMFileConstants.CSI_AUXDATA_OFFSET != position()) { + seek(BAMFileConstants.CSI_AUXDATA_OFFSET); + } + //set the aux data length first + byte[] auxData = new byte[readInteger()]; // l_aux + readBytes(auxData); // aux + setAuxData(auxData); + setNumberOfReferences(readInteger()); // n_ref + metaDataPos = position(); // save the metadata position for delayed reading + } + + @Override + protected final void initParameters() { + readMinShiftAndBinDepth(); + readAuxDataAndNRef(); + setSequenceIndexes(getNumberOfReferences()); + } + + public int getParentBinNumber(int binNumber) { + if (binNumber >= getMaxBins()) { + throw new SAMException("Tried to get parent bin for invalid bin (" + binNumber + ")."); + } + if (binNumber == 0) { + return 0; + } + return (binNumber - 1) >> 3; + } + + public int getParentBinNumber(Bin bin) { + if (bin == null) { + throw new SAMException("Tried to get parent bin for null bin."); + } + return getParentBinNumber(bin.getBinNumber()); + } + + /** + * The maximum possible bin number for this reference sequence. + * This is based on the maximum coordinate position of the reference + * which is based on the size of the reference + */ + private int getMaxBinNumberForReference(final int reference) { + try { + final int sequenceLength = getBamDictionary().getSequence(reference).getSequenceLength(); + return getFirstBinInLevelForCSI(getBinDepth() - 1) + (sequenceLength >> getMinShift()); + } catch (final Exception e) { + return getMaxBins(); + } + } + + @Override + protected BAMIndexContent query(final int referenceSequence, final int startPos, final int endPos) { + if (metaDataPos > 0 && position() != metaDataPos) { + seek(metaDataPos); + } + + final List metaDataChunks = new ArrayList(); + + final int sequenceCount = getNumberOfReferences(); + + if (referenceSequence >= sequenceCount) { + return null; + } + + final BitSet regionBins = GenomicIndexUtil.regionToBins(startPos, endPos, getMinShift(), getBinDepth()); + if (regionBins == null) { + return null; + } + + skipToSequence(referenceSequence); + + final int binCount = readInteger(); // n_bin + boolean metaDataSeen = false; + final Bin[] bins = new BinWithOffset[getMaxBinNumberForReference(referenceSequence) +1]; + for (int binNumber = 0; binNumber < binCount; binNumber++) { + final int indexBin = readInteger(); // bin + final long lOffset = readLong(); // l_offset + final int nChunks = readInteger(); // n_chunk + List chunks; + + Chunk lastChunk = null; + if (regionBins.get(indexBin)) { + chunks = new ArrayList(nChunks); + readChunks(nChunks, chunks); + } else if (indexBin == getMaxBins() + 1) { + // meta data - build the bin so that the count of bins is correct; + // but don't attach meta chunks to the bin, or normal queries will be off + readChunks(nChunks, metaDataChunks); + metaDataSeen = true; + continue; // don't create a Bin + } else { + skipBytes(BAMFileConstants.CSI_CHUNK_SIZE * nChunks); + chunks = Collections.emptyList(); + } + final BinWithOffset bin = new BinWithOffset(referenceSequence, indexBin, lOffset); + bin.setChunkList(chunks); + bin.setLastChunk(lastChunk); + bins[indexBin] = bin; + } + + return new BAMIndexContent(referenceSequence, bins, binCount - (metaDataSeen? 1 : 0), new BAMIndexMetaData(metaDataChunks), null); + } + + /** + * Return meta data for the given reference including information about number of aligned, unaligned, and noCoordinate records + * + * @param reference the reference of interest + * @return meta data for the reference + */ + @Override + public BAMIndexMetaData getMetaData(final int reference) { + if (metaDataPos > 0 && position() != metaDataPos) { + seek(metaDataPos); + } + + final List metaDataChunks = new ArrayList(); + + final int sequenceCount = getNumberOfReferences(); + + if (reference >= sequenceCount) { + return null; + } + + skipToSequence(reference); + + final int binCount = readInteger(); // n_bin + for (int binNumber = 0; binNumber < binCount; binNumber++) { + final int indexBin = readInteger(); // bin + final long lOffset = readLong(); // loffset + final int nChunks = readInteger(); // n_chunk + if (indexBin == getMaxBins() + 1) { + readChunks(nChunks, metaDataChunks); + } else { + skipBytes(BAMFileConstants.CSI_CHUNK_SIZE * nChunks); + } + } + return new BAMIndexMetaData(metaDataChunks); + } + + /** + * Returns count of records unassociated with any reference. Call before the index file is closed + * + * @return meta data at the end of the bam index that indicates count of records holding no coordinates + * or null if no meta data (old index format) + */ + @Override + public Long getNoCoordinateCount() { + if (metaDataPos > 0 && position() != metaDataPos) { + seek(metaDataPos); + } + + skipToSequence(getNumberOfReferences()); + try { // in case of old index file without meta data + return readLong(); + } catch (final Exception e) { + return null; + } + } + + @Override + public BAMIndexContent getQueryResults(final int referenceSequence) { + return query(referenceSequence, 1, -1); + } + + private void skipToSequence(final int sequenceIndex) { + if(sequenceIndex > getNumberOfReferences()) { + throw new SAMException("Sequence index (" + sequenceIndex + ") is greater than maximum (" + getNumberOfReferences() + ")."); + } + + //Use sequence position cache if available + if(sequenceIndexes[sequenceIndex] != -1){ + seek(sequenceIndexes[sequenceIndex]); + return; + } + + if (metaDataPos > 0 && position() != metaDataPos) { + seek(metaDataPos); + } + for (int i = 0; i < sequenceIndex; i++) { + + final int nBins = readInteger(); // n_bin + + for (int j = 0; j < nBins; j++) { + readInteger(); // bin + readLong(); // loffset + final int nChunks = readInteger(); // n_chunk + skipBytes(BAMFileConstants.CSI_CHUNK_SIZE * nChunks); + } + } + + //Update sequence position cache + sequenceIndexes[sequenceIndex] = position(); + } +} diff --git a/src/main/java/htsjdk/samtools/CompressedIndexFileBuffer.java b/src/main/java/htsjdk/samtools/CompressedIndexFileBuffer.java new file mode 100644 index 0000000000..fa3328a13f --- /dev/null +++ b/src/main/java/htsjdk/samtools/CompressedIndexFileBuffer.java @@ -0,0 +1,91 @@ +package htsjdk.samtools; + +import htsjdk.samtools.util.BinaryCodec; +import htsjdk.samtools.util.BlockCompressedInputStream; +import htsjdk.samtools.util.RuntimeIOException; + +import java.io.File; +import java.io.IOException; + +/** + * CSI index files produced by SAMtools are BGZF compressed by default. This class + * adds the ability to read such files. + */ +class CompressedIndexFileBuffer implements IndexFileBuffer { + + private final BlockCompressedInputStream mCompressedStream; + private final BinaryCodec binaryCodec; + + CompressedIndexFileBuffer(File file) { + try { + mCompressedStream = new BlockCompressedInputStream(file); + binaryCodec = new BinaryCodec(mCompressedStream); + } catch (IOException ioe) { + throw(new RuntimeIOException("Construction error of CSI compressed stream: " + ioe)); + } + } + + @Override + public void readBytes(final byte[] bytes) { + binaryCodec.readBytes(bytes); + } + + @Override + public int readInteger() { + return binaryCodec.readInt(); + } + + @Override + public long readLong() { + return binaryCodec.readLong(); + } + + @Override + public void skipBytes(final int count) { + if (mCompressedStream == null) { + throw new SAMException("Null input stream."); + } + + try { + mCompressedStream.skip(count); + } catch (IOException ioe) { + throw(new RuntimeIOException("Skip error in CSI compressed stream: " + ioe)); + } + } + + @Override + public void seek(final long position) { + if (mCompressedStream == null) { + throw new SAMException("Null input stream."); + } + + try { + mCompressedStream.seek(position); + } catch (IOException ioe) { + throw(new RuntimeIOException("Seek error in CSI compressed stream: " + ioe)); + } + } + + @Override + public long position() { + if (mCompressedStream == null) { + throw new SAMException("Null input stream."); + } + + return mCompressedStream.getPosition(); + } + + @Override + public void close() { + if (mCompressedStream == null) { + throw new SAMException("Null input stream."); + } + + try { + mCompressedStream.close(); + } catch (IOException ioe) { + throw(new RuntimeIOException("Close error in CSI compressed stream: " + ioe)); + } + } + +} diff --git a/src/main/java/htsjdk/samtools/GenomicIndexUtil.java b/src/main/java/htsjdk/samtools/GenomicIndexUtil.java index f634932618..5d0aa7c260 100644 --- a/src/main/java/htsjdk/samtools/GenomicIndexUtil.java +++ b/src/main/java/htsjdk/samtools/GenomicIndexUtil.java @@ -70,6 +70,29 @@ public static int regionToBin(final int beg, int end) return 0; } + /** + * calculate the bin given an alignment in [beg,end) + * Described in "The Human Genome Browser at UCSC. Kent & al. doi: 10.1101/gr.229102 " + * @param beg 0-based start of read (inclusive) + * @param end 0-based end of read (exclusive) + * @param minShift minimum bin width (2^minShift) + * @param binDepth number of levels in the binning scheme (including bin 0) + */ + public static int regionToBin(final int beg, int end, final int minShift, final int binDepth) + { + --end; + int binWidth = minShift, maxShift = minShift + 3*(binDepth-1); + + while (binWidth < maxShift) { + if (beg>>binWidth == end>>binWidth) { + return ((1<<(maxShift - binWidth)) - 1)/7 + (beg>>binWidth); + } + binWidth+=3; + } + + return 0; + } + // TODO: It is disturbing that regionToBins is 0-based, but regionToBins is 1-based. // TODO: It is also suspicious that regionToBins decrements endPos. Test it! // TODO: However end is decremented in regionToBins so perhaps there is no conflict. @@ -96,5 +119,35 @@ public static BitSet regionToBins(final int startPos, final int endPos) { for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k); return bitSet; } + /** + * Get candidate bins for the specified region + * @param startPos 1-based start of target region, inclusive. + * @param endPos 1-based end of target region, inclusive. + * @return bit set for each bin that may contain SAMRecords in the target region. + * @param minShift minimum bin width (2^minShift). + * @param binDepth number of levels in the binning scheme (including bin 0). + */ + public static BitSet regionToBins(final int startPos, final int endPos, final int minShift, final int binDepth) { + final int maxPos = (1<<(minShift + 3*(binDepth - 1)))-1; + final int start = (startPos <= 0) ? 0 : (startPos-1) & maxPos; + final int end = (endPos <= 0) ? maxPos : (endPos-1) & maxPos; + if (start > end) { + return null; + } + int k, firstBinOnLevel = 1, level = 1, binWidth = minShift + 3*(binDepth - 2); + final BitSet bitSet = new BitSet(((1<<3*binDepth) - 1)/7); + + bitSet.set(0); + while (level < binDepth) { + for (k = firstBinOnLevel + (start >> binWidth); k <= firstBinOnLevel + (end >> binWidth); ++k) { + bitSet.set(k); + } + + firstBinOnLevel = firstBinOnLevel + (1<<3*level); + binWidth = binWidth - 3; + level = level + 1; + } + return bitSet; + } } diff --git a/src/main/java/htsjdk/samtools/IndexFileBuffer.java b/src/main/java/htsjdk/samtools/IndexFileBuffer.java new file mode 100644 index 0000000000..e8e421b759 --- /dev/null +++ b/src/main/java/htsjdk/samtools/IndexFileBuffer.java @@ -0,0 +1,17 @@ +package htsjdk.samtools; + +import java.io.Closeable; + +/** + * Interface for managing index files and streams. Has to be implemented + * by all classes that directly access index data. + */ +interface IndexFileBuffer extends Closeable { + void readBytes(final byte[] bytes); + int readInteger(); + long readLong(); + void skipBytes(final int count); + void seek(final long position); + long position(); + void close(); +} \ No newline at end of file diff --git a/src/main/java/htsjdk/samtools/IndexFileBufferFactory.java b/src/main/java/htsjdk/samtools/IndexFileBufferFactory.java new file mode 100644 index 0000000000..d3c9d1cc0a --- /dev/null +++ b/src/main/java/htsjdk/samtools/IndexFileBufferFactory.java @@ -0,0 +1,21 @@ +package htsjdk.samtools; + +import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.RuntimeIOException; + +import java.io.File; +import java.io.IOException; + +class IndexFileBufferFactory { + + static IndexFileBuffer getBuffer(File file, boolean enableMemoryMapping) { + boolean isCompressed; + try { + isCompressed = IOUtil.isBlockCompressed(file.toPath()); + } catch (IOException ioe) { + throw (new RuntimeIOException(ioe)); + } + + return isCompressed ? new CompressedIndexFileBuffer(file) : (enableMemoryMapping ? new MemoryMappedFileBuffer(file) : new RandomAccessFileBuffer(file)); + } +} diff --git a/src/main/java/htsjdk/samtools/IndexStreamBuffer.java b/src/main/java/htsjdk/samtools/IndexStreamBuffer.java new file mode 100644 index 0000000000..5486dbe9c0 --- /dev/null +++ b/src/main/java/htsjdk/samtools/IndexStreamBuffer.java @@ -0,0 +1,85 @@ +package htsjdk.samtools; + +import htsjdk.samtools.seekablestream.SeekableStream; +import htsjdk.samtools.util.RuntimeIOException; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +class IndexStreamBuffer implements IndexFileBuffer { + private final SeekableStream in; + private final ByteBuffer tmpBuf; + + /** Continually reads from the provided {@link SeekableStream} into the buffer until the specified number of bytes are read, or + * until the stream is exhausted, throwing a {@link RuntimeIOException}. */ + private static void readFully(final SeekableStream in, final byte[] buffer, final int length) { + int read = 0; + while (read < length) { + final int readThisLoop; + try { + readThisLoop = in.read(buffer, read, length - read); + } catch (final IOException e) { + throw new RuntimeIOException(e); + } + if (readThisLoop == -1) break; + read += readThisLoop; + } + if (read != length) throw new RuntimeIOException("Expected to read " + length + " bytes, but expired stream after " + read + "."); + } + + public IndexStreamBuffer(final SeekableStream s) { + in = s; + tmpBuf = ByteBuffer.allocate(8); // Enough to fit a long. + tmpBuf.order(ByteOrder.LITTLE_ENDIAN); + } + + @Override + public void close() { + try { in.close(); } + catch (final IOException e) { throw new RuntimeIOException(e); } + } + + @Override + public void readBytes(final byte[] bytes) { + readFully(in, bytes, bytes.length); + } + + @Override + public void seek(final long position) { + try { in.seek(position); } + catch (final IOException e) { throw new RuntimeIOException(e); } + } + + @Override + public int readInteger() { + readFully(in, tmpBuf.array(), 4); + return tmpBuf.getInt(0); + } + + @Override + public long readLong() { + readFully(in, tmpBuf.array(), 8); + return tmpBuf.getLong(0); + } + + @Override + public void skipBytes(final int count) { + try { + for (int s = count; s > 0;) { + final int skipped = (int)in.skip(s); + if (skipped <= 0) { + throw new RuntimeIOException("Failed to skip " + s); + } + s -= skipped; + } + } catch (final IOException e) { throw new RuntimeIOException(e); } + } + + @Override + public long position() { + try { + return (int) in.position(); + } catch (final IOException e) { throw new RuntimeIOException(e); } + } +} diff --git a/src/main/java/htsjdk/samtools/MemoryMappedFileBuffer.java b/src/main/java/htsjdk/samtools/MemoryMappedFileBuffer.java new file mode 100644 index 0000000000..39d12adbf2 --- /dev/null +++ b/src/main/java/htsjdk/samtools/MemoryMappedFileBuffer.java @@ -0,0 +1,66 @@ +package htsjdk.samtools; + +import htsjdk.samtools.util.RuntimeIOException; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteOrder; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; + +/** + * Traditional implementation of BAM index file access using memory mapped files. + */ +class MemoryMappedFileBuffer implements IndexFileBuffer { + private MappedByteBuffer mFileBuffer; + + MemoryMappedFileBuffer(final File file) { + try { + // Open the file stream. + final FileInputStream fileStream = new FileInputStream(file); + final FileChannel fileChannel = fileStream.getChannel(); + mFileBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0L, fileChannel.size()); + mFileBuffer.order(ByteOrder.LITTLE_ENDIAN); + fileChannel.close(); + fileStream.close(); + } catch (final IOException exc) { + throw new RuntimeIOException(exc.getMessage(), exc); + } + } + + @Override + public void readBytes(final byte[] bytes) { + mFileBuffer.get(bytes); + } + + @Override + public int readInteger() { + return mFileBuffer.getInt(); + } + + @Override + public long readLong() { + return mFileBuffer.getLong(); + } + + @Override + public void skipBytes(final int count) { + mFileBuffer.position(mFileBuffer.position() + count); + } + + @Override + public void seek(final long position) { + mFileBuffer.position((int)position); + } + + @Override + public long position() { + return mFileBuffer.position(); + } + + @Override + public void close() { + mFileBuffer = null; + } +} diff --git a/src/main/java/htsjdk/samtools/RandomAccessFileBuffer.java b/src/main/java/htsjdk/samtools/RandomAccessFileBuffer.java new file mode 100644 index 0000000000..3eb271a0ff --- /dev/null +++ b/src/main/java/htsjdk/samtools/RandomAccessFileBuffer.java @@ -0,0 +1,129 @@ +package htsjdk.samtools; + +import htsjdk.samtools.util.RuntimeIOException; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; + +/** + * Alternative implementation of BAM index file access using regular I/O instead of memory mapping. + * + * This implementation can be more scalable for certain applications that need to access large numbers of BAM files. + * Java provides no way to explicitly release a memory mapping. Instead, you need to wait for the garbage collector + * to finalize the MappedByteBuffer. Because of this, when accessing many BAM files or when querying many BAM files + * sequentially, you cannot easily control the physical memory footprint of the java process. + * This can limit scalability and can have bad interactions with load management software like LSF, forcing you + * to reserve enough physical memory for a worst case scenario. + * The use of regular I/O allows you to trade somewhat slower performance for a small, fixed memory footprint + * if that is more suitable for your application. + */ +class RandomAccessFileBuffer implements IndexFileBuffer { + private static final int PAGE_SIZE = 4 * 1024; + private static final int PAGE_OFFSET_MASK = PAGE_SIZE-1; + private static final int PAGE_MASK = ~PAGE_OFFSET_MASK; + private static final int INVALID_PAGE = 1; + private final File mFile; + private RandomAccessFile mRandomAccessFile; + private final int mFileLength; + private long mFilePointer = 0; + private int mCurrentPage = INVALID_PAGE; + private final byte[] mBuffer = new byte[PAGE_SIZE]; + + RandomAccessFileBuffer(final File file) { + mFile = file; + try { + mRandomAccessFile = new RandomAccessFile(file, "r"); + final long fileLength = mRandomAccessFile.length(); + if (fileLength > Integer.MAX_VALUE) { + throw new RuntimeIOException("BAM index file " + mFile + " is too large: " + fileLength); + } + mFileLength = (int) fileLength; + } catch (final IOException exc) { + throw new RuntimeIOException(exc.getMessage(), exc); + } + } + + @Override + public void readBytes(final byte[] bytes) { + int resultOffset = 0; + int resultLength = bytes.length; + if (mFilePointer + resultLength > mFileLength) { + throw new RuntimeIOException("Attempt to read past end of BAM index file (file is truncated?): " + mFile); + } + while (resultLength > 0) { + loadPage(mFilePointer); + final int pageOffset = (int)mFilePointer & PAGE_OFFSET_MASK; + final int copyLength = Math.min(resultLength, PAGE_SIZE - pageOffset); + System.arraycopy(mBuffer, pageOffset, bytes, resultOffset, copyLength); + mFilePointer += copyLength; + resultOffset += copyLength; + resultLength -= copyLength; + } + } + + @Override + public int readInteger() { + // This takes advantage of the fact that integers in BAM index files are always 4-byte aligned. + loadPage(mFilePointer); + final int pageOffset = (int)mFilePointer & PAGE_OFFSET_MASK; + mFilePointer += 4; + return((mBuffer[pageOffset + 0] & 0xFF) | + ((mBuffer[pageOffset + 1] & 0xFF) << 8) | + ((mBuffer[pageOffset + 2] & 0xFF) << 16) | + ((mBuffer[pageOffset + 3] & 0xFF) << 24)); + } + + @Override + public long readLong() { + // BAM index files are always 4-byte aligned, but not necessrily 8-byte aligned. + // So, rather than fooling with complex page logic we simply read the long in two 4-byte chunks. + final long lower = readInteger(); + final long upper = readInteger(); + return ((upper << 32) | (lower & 0xFFFFFFFFL)); + } + + @Override + public void skipBytes(final int count) { + mFilePointer += count; + } + + @Override + public void seek(final long position) { + mFilePointer = position; + } + + @Override + public long position() { + return mFilePointer; + } + + @Override + public void close() { + mFilePointer = 0; + mCurrentPage = INVALID_PAGE; + if (mRandomAccessFile != null) { + try { + mRandomAccessFile.close(); + } catch (final IOException exc) { + throw new RuntimeIOException(exc.getMessage(), exc); + } + mRandomAccessFile = null; + } + } + + private void loadPage(final long filePosition) { + final int page = (int)filePosition & PAGE_MASK; + if (page == mCurrentPage) { + return; + } + try { + mRandomAccessFile.seek(page); + final int readLength = Math.min(mFileLength - page, PAGE_SIZE); + mRandomAccessFile.readFully(mBuffer, 0, readLength); + mCurrentPage = page; + } catch (final IOException exc) { + throw new RuntimeIOException("Exception reading BAM index file " + mFile + ": " + exc.getMessage(), exc); + } + } +} diff --git a/src/main/java/htsjdk/samtools/SAMFileWriterFactory.java b/src/main/java/htsjdk/samtools/SAMFileWriterFactory.java index 1fb0b89e89..048f379003 100644 --- a/src/main/java/htsjdk/samtools/SAMFileWriterFactory.java +++ b/src/main/java/htsjdk/samtools/SAMFileWriterFactory.java @@ -636,7 +636,7 @@ private CRAMFileWriter createCRAMWriterWithSettings( log.warn("Cannot create index for CRAM because output file is not a regular file: " + outputFile.toUri()); } else { - final Path indexPath = IOUtil.addExtension(outputFile, BAMIndex.BAMIndexSuffix); + final Path indexPath = IOUtil.addExtension(outputFile, BAMIndex.BAI_INDEX_SUFFIX); try { indexOS = Files.newOutputStream(indexPath); } diff --git a/src/main/java/htsjdk/samtools/SamFiles.java b/src/main/java/htsjdk/samtools/SamFiles.java index 874fc1027e..f6259e09d2 100644 --- a/src/main/java/htsjdk/samtools/SamFiles.java +++ b/src/main/java/htsjdk/samtools/SamFiles.java @@ -63,14 +63,20 @@ private static Path unsymlinkAndLookForIndex(Path samPath) { } } - private static Path lookForIndex(final Path samPath) {// If input is foo.bam, look for foo.bai + private static Path lookForIndex(final Path samPath) {// If input is foo.bam, look for foo.bai or foo.csi Path indexPath; final String fileName = samPath.getFileName().toString(); // works for all path types (e.g. HDFS) if (fileName.endsWith(BamFileIoUtils.BAM_FILE_EXTENSION)) { - final String bai = fileName.substring(0, fileName.length() - BamFileIoUtils.BAM_FILE_EXTENSION.length()) + BAMIndex.BAMIndexSuffix; + final String bai = fileName.substring(0, fileName.length() - BamFileIoUtils.BAM_FILE_EXTENSION.length()) + BAMIndex.BAI_INDEX_SUFFIX; + final String csi = fileName.substring(0, fileName.length() - BamFileIoUtils.BAM_FILE_EXTENSION.length()) + BAMIndex.CSI_INDEX_SUFFIX; indexPath = samPath.resolveSibling(bai); if (Files.isRegularFile(indexPath)) { // works for all path types (e.g. HDFS) return indexPath; + } else { // if there is no .bai index, look for .csi index + indexPath = samPath.resolveSibling(csi); + if (Files.isRegularFile(indexPath)) { + return indexPath; + } } @@ -88,9 +94,14 @@ private static Path lookForIndex(final Path samPath) {// If input is foo.bam, lo } // If foo.bai doesn't exist look for foo.bam.bai or foo.cram.bai - indexPath = samPath.resolveSibling(fileName + BAMIndex.BAMIndexSuffix); + indexPath = samPath.resolveSibling(fileName + BAMIndex.BAI_INDEX_SUFFIX); if (Files.isRegularFile(indexPath)) { return indexPath; + } else { + indexPath = samPath.resolveSibling(fileName + BAMIndex.CSI_INDEX_SUFFIX); + if (Files.isRegularFile(indexPath)) { + return indexPath; + } } return null; diff --git a/src/main/java/htsjdk/samtools/SamIndexes.java b/src/main/java/htsjdk/samtools/SamIndexes.java index a888811f12..7cd634c5a7 100644 --- a/src/main/java/htsjdk/samtools/SamIndexes.java +++ b/src/main/java/htsjdk/samtools/SamIndexes.java @@ -16,9 +16,10 @@ * Created by vadim on 14/08/2015. */ public enum SamIndexes { - BAI(BAMIndex.BAMIndexSuffix, "BAI\1".getBytes()), + BAI(BAMIndex.BAI_INDEX_SUFFIX, "BAI\1".getBytes()), // CRAI is gzipped text, so it's magic is same as {@link java.util.zip.GZIPInputStream.GZIP_MAGIC} - CRAI(CRAIIndex.CRAI_INDEX_SUFFIX, new byte[]{(byte) 0x1f, (byte) 0x8b}); + CRAI(CRAIIndex.CRAI_INDEX_SUFFIX, new byte[]{(byte) 0x1f, (byte) 0x8b}), + CSI(BAMIndex.CSI_INDEX_SUFFIX, "CSI\1".getBytes()); public final String fileNameSuffix; public final byte[] magic; @@ -39,6 +40,9 @@ public static InputStream openIndexUrlAsBaiOrNull(final URL url, final SAMSequen if (url.getFile().toLowerCase().endsWith(CRAI.fileNameSuffix.toLowerCase())) { return CRAIIndex.openCraiFileAsBaiStream(url.openStream(), dictionary); } + if (url.getFile().toLowerCase().endsWith(CSI.fileNameSuffix.toLowerCase())) { + return url.openStream(); + } return null; } @@ -61,6 +65,14 @@ public static InputStream asBaiStreamOrNull(final InputStream inputStream, final bis.reset(); } + bis.mark(CSI.magic.length); + if (doesStreamStartWith(bis, CSI.magic)) { + bis.reset(); + return bis; + } else { + bis.reset(); + } + return null; } @@ -80,6 +92,12 @@ public static SeekableStream asBaiSeekableStreamOrNull(final SeekableStream inpu bis.reset(); } + bis.seek(0); + if (doesStreamStartWith(bis, CSI.magic)) { + bis.seek(0); + return bis; + } + return null; } diff --git a/src/main/java/htsjdk/samtools/SamReader.java b/src/main/java/htsjdk/samtools/SamReader.java index e0c147b934..7ec85ac099 100644 --- a/src/main/java/htsjdk/samtools/SamReader.java +++ b/src/main/java/htsjdk/samtools/SamReader.java @@ -85,6 +85,7 @@ public String toString() { public static final Type CRAM_TYPE = new TypeImpl("CRAM", "cram", "crai"); public static final Type BAM_TYPE = new TypeImpl("BAM", "bam", "bai"); public static final Type SAM_TYPE = new TypeImpl("SAM", "sam", null); + public static final Type BAM_CSI_TYPE = new TypeImpl("BAM", "bam", "csi"); } /** diff --git a/src/test/java/htsjdk/samtools/AbstractBAMFileIndexTest.java b/src/test/java/htsjdk/samtools/AbstractBAMFileIndexTest.java index cf451b86ae..b1c5bf88e3 100644 --- a/src/test/java/htsjdk/samtools/AbstractBAMFileIndexTest.java +++ b/src/test/java/htsjdk/samtools/AbstractBAMFileIndexTest.java @@ -2,18 +2,22 @@ import htsjdk.HtsjdkTest; import htsjdk.samtools.seekablestream.SeekableStream; +import org.testng.Assert; import org.testng.annotations.Test; +import java.io.File; import java.io.IOException; public class AbstractBAMFileIndexTest extends HtsjdkTest { + private static final AbstractBAMFileIndex afi = new DiskBasedBAMFileIndex(new File("src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam.bai"), null); + /** * @see https://github.com/samtools/htsjdk/issues/73 */ @Test public static void avoidDataExhaustionTest() { - final AbstractBAMFileIndex.IndexStreamBuffer buffer = new AbstractBAMFileIndex.IndexStreamBuffer(new SeekableStream() { + final IndexStreamBuffer buffer = new IndexStreamBuffer(new SeekableStream() { @Override public long length() { return 0; @@ -60,4 +64,39 @@ public int read() throws IOException { buffer.readInteger(); buffer.readBytes(new byte[10000]); } + + @Test + public static void testGetNumIndexLevels() { + Assert.assertEquals(AbstractBAMFileIndex.getNumIndexLevels(), 6); + } + + @Test + public static void testGetFirstBinInLevelOK() { + Assert.assertEquals(AbstractBAMFileIndex.getFirstBinInLevel(0), 0); + Assert.assertEquals(AbstractBAMFileIndex.getFirstBinInLevel(1), 1); + Assert.assertEquals(AbstractBAMFileIndex.getFirstBinInLevel(2), 9); + Assert.assertEquals(AbstractBAMFileIndex.getFirstBinInLevel(3), 73); + Assert.assertEquals(AbstractBAMFileIndex.getFirstBinInLevel(4), 585); + Assert.assertEquals(AbstractBAMFileIndex.getFirstBinInLevel(5), 4681); + } + + @Test (expectedExceptions = SAMException.class) + public static void testGetFirstBinInLevelFail() { + AbstractBAMFileIndex.getFirstBinInLevel(6); + } + + @Test + public static void testGetLevelSizeOK() { + Assert.assertEquals(afi.getLevelSize(0), 1); + Assert.assertEquals(afi.getLevelSize(1), 8); + Assert.assertEquals(afi.getLevelSize(2), 64); + Assert.assertEquals(afi.getLevelSize(3), 512); + Assert.assertEquals(afi.getLevelSize(4), 4096); + Assert.assertEquals(afi.getLevelSize(5), 32768); + } + + @Test (expectedExceptions = SAMException.class) + public static void testGetLevelSizeFail() { + afi.getLevelSize(6); + } } diff --git a/src/test/java/htsjdk/samtools/BAMFileReaderTest.java b/src/test/java/htsjdk/samtools/BAMFileReaderTest.java new file mode 100644 index 0000000000..426b164f9e --- /dev/null +++ b/src/test/java/htsjdk/samtools/BAMFileReaderTest.java @@ -0,0 +1,306 @@ +package htsjdk.samtools; + +import htsjdk.HtsjdkTest; +import htsjdk.samtools.seekablestream.ByteArraySeekableStream; +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.samtools.util.CoordMath; +import org.testng.Assert; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +public class BAMFileReaderTest extends HtsjdkTest { + private final static File bamFile = new File("src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam"); + private final static File baiFileIndex = new File(bamFile.getPath() + ".bai"); + private final static File csiFileIndex = new File(bamFile.getPath() + ".csi"); + private final static File iiiFileIndex = new File(bamFile.getPath() + ".iii"); + private final static int nofMappedReads = 9721; + private final static int nofUnmappedReads = 279 ; + private final static int noChrMReads = 23; + private final static int noChrMReadsContained = 9; + private final static int noChrMReadsOverlapped = 10; + + private static BAMFileReader bamFileReaderBAI; + private static BAMFileReader bamFileReaderCSI; + private static BAMFileReader bamFileReaderWrong; + private static BAMFileReader bamFileReaderNull; + + @BeforeTest + public void init() throws IOException { + bamFileReaderBAI = new BAMFileReader(bamFile, baiFileIndex, true, false, ValidationStringency.DEFAULT_STRINGENCY, DefaultSAMRecordFactory.getInstance()); + bamFileReaderCSI = new BAMFileReader(bamFile, csiFileIndex, true, false, ValidationStringency.DEFAULT_STRINGENCY, DefaultSAMRecordFactory.getInstance()); + bamFileReaderWrong = new BAMFileReader(bamFile, iiiFileIndex, true, false, ValidationStringency.DEFAULT_STRINGENCY, DefaultSAMRecordFactory.getInstance()); + bamFileReaderNull = new BAMFileReader(bamFile, null, true, false, ValidationStringency.DEFAULT_STRINGENCY, DefaultSAMRecordFactory.getInstance()); + } + + @Test + public static void testGetIndexTypeOK() { + BAMIndexMetaData.printIndexStats(bamFile); + Assert.assertTrue(bamFileReaderBAI.getIndexType().equals(SamIndexes.BAI)); + Assert.assertTrue(bamFileReaderCSI.getIndexType().equals(SamIndexes.CSI)); + } + + @Test (expectedExceptions = SAMFormatException.class) + public static void testGetIndexTypeException() { + Assert.assertTrue(bamFileReaderWrong.getIndexType().equals(SamIndexes.BAI)); + } + + @Test + public static void testGetIndexTypeDefault() { + Assert.assertTrue(bamFileReaderNull.getIndexType().equals(SamIndexes.BAI)); + } + + @Test + public static void testGetIndexOK() { + Assert.assertTrue(bamFileReaderBAI.hasIndex()); + Assert.assertTrue(bamFileReaderCSI.hasIndex()); + Assert.assertTrue(bamFileReaderNull.hasIndex()); + Assert.assertNotNull(bamFileReaderBAI.getIndex()); + Assert.assertNotNull(bamFileReaderCSI.getIndex()); + Assert.assertNotNull(bamFileReaderNull.getIndex()); + } + + @Test (expectedExceptions = SAMFormatException.class) + public static void testGetIndexException() { + Assert.assertNotNull(bamFileReaderWrong.getIndex()); + } + + @Test + public static void testQueryMapped() throws IOException { + try (SamReader samReader = SamReaderFactory.makeDefault().open(bamFile); + SAMRecordIterator samRecordIterator = samReader.iterator()) + { + Assert.assertEquals(samReader.getFileHeader().getSortOrder(), SAMFileHeader.SortOrder.coordinate); + + int counter = 0; + while (samRecordIterator.hasNext()) { + SAMRecord samRecord = samRecordIterator.next(); + if (samRecord.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + break; + } + if (counter++ % 100 > 1) { // test only 1st and 2nd in every 100 to speed the test up: + continue; + } + String sam1 = samRecord.getSAMString(); + + CloseableIterator iterator = bamFileReaderBAI.queryAlignmentStart( + samRecord.getReferenceName(), + samRecord.getAlignmentStart()); + + Assert.assertTrue(iterator.hasNext(), counter + ": " + sam1); + SAMRecord bamRecord = iterator.next(); + String sam2 = bamRecord.getSAMString(); + Assert.assertEquals(samRecord.getReferenceName(), bamRecord.getReferenceName(), sam1 + sam2); + + // default 'overlap' is true, so test records intersect the query: + Assert.assertTrue(CoordMath.overlaps( + bamRecord.getAlignmentStart(), + bamRecord.getAlignmentEnd(), + samRecord.getAlignmentStart(), + samRecord.getAlignmentEnd()), + sam1 + sam2); + + iterator.close(); + } + Assert.assertEquals(counter, nofMappedReads); + } + + // test the reader with the CSI index + try (SamReader samReader = SamReaderFactory.makeDefault().open(bamFile); + SAMRecordIterator samRecordIterator = samReader.iterator()) + { + Assert.assertEquals(samReader.getFileHeader().getSortOrder(), SAMFileHeader.SortOrder.coordinate); + + int counter = 0; + while (samRecordIterator.hasNext()) { + SAMRecord samRecord = samRecordIterator.next(); + if (samRecord.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + break; + } + if (counter++ % 100 > 1) { // test only 1st and 2nd in every 100 to speed the test up: + continue; + } + String sam1 = samRecord.getSAMString(); + + CloseableIterator iterator = bamFileReaderCSI.queryAlignmentStart( + samRecord.getReferenceName(), + samRecord.getAlignmentStart()); + + Assert.assertTrue(iterator.hasNext(), counter + ": " + sam1); + SAMRecord bamRecord = iterator.next(); + String sam2 = bamRecord.getSAMString(); + Assert.assertEquals(samRecord.getReferenceName(), bamRecord.getReferenceName(), sam1 + sam2); + + // default 'overlap' is true, so test records intersect the query: + Assert.assertTrue(CoordMath.overlaps( + bamRecord.getAlignmentStart(), + bamRecord.getAlignmentEnd(), + samRecord.getAlignmentStart(), + samRecord.getAlignmentEnd()), + sam1 + sam2); + + iterator.close(); + } + Assert.assertEquals(counter, nofMappedReads); + } + } + + @Test + public static void testQueryUnmapped() { + int counter = 0; + CloseableIterator baiIterator = bamFileReaderBAI.queryUnmapped(); + CloseableIterator csiIterator = bamFileReaderCSI.queryUnmapped(); + + Assert.assertTrue(baiIterator.hasNext()); + while (baiIterator.hasNext()) { + Assert.assertTrue(csiIterator.hasNext()); + + SAMRecord r1 = baiIterator.next(); + SAMRecord r2 = csiIterator.next(); + Assert.assertEquals(r1.getReadName(), r2.getReadName()); + Assert.assertEquals(r1.getBaseQualityString(), r2.getBaseQualityString()); + + counter++; + } + Assert.assertFalse(csiIterator.hasNext()); + Assert.assertEquals(counter, nofUnmappedReads); + + baiIterator.close(); + csiIterator.close(); + } + + @Test + public static void testQueryInterval() { + QueryInterval[] query = new QueryInterval[]{new QueryInterval(0, 1519, 1520), new QueryInterval(1, 470535, 470536)}; + final CloseableIterator baiIterator = bamFileReaderBAI.query(query, false); + final CloseableIterator csiIterator = bamFileReaderCSI.query(query, false); + + Assert.assertTrue(baiIterator.hasNext()); + Assert.assertTrue(csiIterator.hasNext()); + SAMRecord r1 = baiIterator.next(); + SAMRecord r2 = csiIterator.next(); + Assert.assertEquals(r1.getReadName(), "3968040"); + Assert.assertEquals(r2.getReadName(), "3968040"); + + r1 = baiIterator.next(); + r2 = csiIterator.next(); + Assert.assertEquals(r1.getReadName(), "140419"); + Assert.assertEquals(r2.getReadName(), "140419"); + + baiIterator.close(); + csiIterator.close(); + } + + @Test + public static void testQueryContained() { + int counter = 0; + CloseableIterator baiIterator = bamFileReaderBAI.query("chrM", 1500, -1, true); + CloseableIterator csiIterator = bamFileReaderCSI.query("chrM", 1500, -1, true); + + Assert.assertTrue(baiIterator.hasNext()); + while (baiIterator.hasNext()) { + Assert.assertTrue(csiIterator.hasNext()); + + SAMRecord r1 = baiIterator.next(); + SAMRecord r2 = csiIterator.next(); + Assert.assertEquals(r1.getReadName(), r2.getReadName()); + Assert.assertEquals(r1.getBaseQualityString(), r2.getBaseQualityString()); + + counter++; + } + Assert.assertFalse(csiIterator.hasNext()); + Assert.assertEquals(counter, noChrMReads); + + baiIterator.close(); + csiIterator.close(); + + counter = 0; + baiIterator = bamFileReaderBAI.query("chrM", 1500, 10450, true); + csiIterator = bamFileReaderCSI.query("chrM", 1500, 10450, true); + + Assert.assertTrue(baiIterator.hasNext()); + while (baiIterator.hasNext()) { + Assert.assertTrue(csiIterator.hasNext()); + + SAMRecord r1 = baiIterator.next(); + SAMRecord r2 = csiIterator.next(); + Assert.assertEquals(r1.getReadName(), r2.getReadName()); + Assert.assertEquals(r1.getBaseQualityString(), r2.getBaseQualityString()); + + counter++; + } + Assert.assertFalse(csiIterator.hasNext()); + Assert.assertEquals(counter, noChrMReadsContained); + + baiIterator.close(); + csiIterator.close(); + } + + @Test + public static void testQueryOverlapped() { + int counter = 0; + CloseableIterator baiIterator = bamFileReaderBAI.query("chrM", 1500, 10450, false); + CloseableIterator csiIterator = bamFileReaderCSI.query("chrM", 1500, 10450, false); + + Assert.assertTrue(baiIterator.hasNext()); + while (baiIterator.hasNext()) { + Assert.assertTrue(csiIterator.hasNext()); + + SAMRecord r1 = baiIterator.next(); + SAMRecord r2 = csiIterator.next(); + Assert.assertEquals(r1.getReadName(), r2.getReadName()); + Assert.assertEquals(r1.getBaseQualityString(), r2.getBaseQualityString()); + + counter++; + } + Assert.assertFalse(csiIterator.hasNext()); + Assert.assertEquals(counter, noChrMReadsOverlapped); + + baiIterator.close(); + csiIterator.close(); + } + + @Test + public static void testQueryAlignmentStartNone() throws IOException { + // the first read starts from 1519 + final CloseableIterator baiIterator = bamFileReaderBAI.queryAlignmentStart("chrM", 1500); + final CloseableIterator csiIterator = bamFileReaderCSI.queryAlignmentStart("chrM", 1500); + + Assert.assertFalse(baiIterator.hasNext()); + Assert.assertFalse(csiIterator.hasNext()); + + baiIterator.close(); + csiIterator.close(); + } + + @Test + public static void testQueryAlignmentStartOne() throws IOException { + // one read on chrM starts from 9060 + final CloseableIterator baiIterator = bamFileReaderBAI.queryAlignmentStart("chrM", 9060); + final CloseableIterator csiIterator = bamFileReaderCSI.queryAlignmentStart("chrM", 9060); + + Assert.assertTrue(baiIterator.hasNext()); + Assert.assertTrue(csiIterator.hasNext()); + + SAMRecord r1 = baiIterator.next(); + SAMRecord r2 = csiIterator.next(); + Assert.assertEquals(r1.getReadName(), r2.getReadName()); + Assert.assertEquals(r1.getBaseQualityString(), r2.getBaseQualityString()); + + + Assert.assertFalse(baiIterator.hasNext()); + Assert.assertFalse(csiIterator.hasNext()); + + baiIterator.close(); + csiIterator.close(); + } + + @Test + public static void testFindVirtualOffsetOfFirstRecord() throws IOException { + Assert.assertEquals(BAMFileReader.findVirtualOffsetOfFirstRecord(bamFile), 8384); + } + +} diff --git a/src/test/java/htsjdk/samtools/BAMIndexValidatorTest.java b/src/test/java/htsjdk/samtools/BAMIndexValidatorTest.java new file mode 100644 index 0000000000..4387acdc02 --- /dev/null +++ b/src/test/java/htsjdk/samtools/BAMIndexValidatorTest.java @@ -0,0 +1,32 @@ +package htsjdk.samtools; + +import htsjdk.HtsjdkTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; + +public class BAMIndexValidatorTest extends HtsjdkTest { + + private static final File BAM_FILE = new File("src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam"); + private static final File BAI_FILE = new File(BAM_FILE.getPath() + ".bai"); + private static final File CSI_FILE = new File(BAM_FILE.getPath() + ".csi"); + + @Test + public void exhaustivelyTestIndexTest () throws IOException { + + BAMFileReader bamFileReader1 = new BAMFileReader(BAM_FILE, BAI_FILE, true, false, ValidationStringency.DEFAULT_STRINGENCY, new DefaultSAMRecordFactory()); + bamFileReader1.enableIndexCaching(true); + BAMFileReader bamFileReader2 = new BAMFileReader(BAM_FILE, CSI_FILE, true, false, ValidationStringency.DEFAULT_STRINGENCY, new DefaultSAMRecordFactory()); + + final SamReader samFileReader1 = new SamReader.PrimitiveSamReaderToSamReaderAdapter(bamFileReader1, null); + final SamReader samFileReader2 = new SamReader.PrimitiveSamReaderToSamReaderAdapter(bamFileReader2, null); + + int baiCount = BamIndexValidator.exhaustivelyTestIndex(samFileReader1); + int csiCount = BamIndexValidator.exhaustivelyTestIndex(samFileReader2); + + Assert.assertEquals(baiCount, csiCount); + } + +} diff --git a/src/test/java/htsjdk/samtools/CSIIndexTest.java b/src/test/java/htsjdk/samtools/CSIIndexTest.java new file mode 100644 index 0000000000..388f70f146 --- /dev/null +++ b/src/test/java/htsjdk/samtools/CSIIndexTest.java @@ -0,0 +1,294 @@ +package htsjdk.samtools; + +import htsjdk.HtsjdkTest; +import org.testng.Assert; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Paths; +import java.util.BitSet; + +public class CSIIndexTest extends HtsjdkTest { + + private static DiskBasedBAMFileIndex bai; + private static CSIIndex csi; + private static CSIIndex mcsi; + private static CSIIndex ucsi; + private static DiskBasedBAMFileIndex ubai; + + private static Bin bin1 = new Bin(0, 0); + private static Bin bin2 = new Bin(0, 1); + private static Bin bin3 = new Bin(1, 0); + private static Bin bin4 = new Bin(1, 1); + private static Bin bin5 = new Bin(1, 9); + private static Bin bin6 = new Bin(1, 586); + private static Bin bin7 = new Bin(1, 80); + private static Bin bin8 = new Bin(1, 4689); + private static Bin bin9 = new Bin(1, 135853); + private static Bin bin10 = new Bin(1, 163); + + @BeforeTest + public void init() { + bai = new DiskBasedBAMFileIndex(new File("src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam.bai"), null); + csi = new CSIIndex(new File("src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam.csi"), false, null); + mcsi = new CSIIndex(new File("src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam.csi"), true, null); + try { + ucsi = new CSIIndex(Paths.get("src/test/resources/htsjdk/samtools/BAMFileIndexTest/uncompressed_index.bam.csi"), null); + } catch (IOException e) { + e.printStackTrace(); + } + ubai = new DiskBasedBAMFileIndex(new File("src/test/resources/htsjdk/samtools/BAMFileIndexTest/uncompressed_index.bam.bai"),null); + } + + + @Test + public static void testGetNumIndexLevels() { + Assert.assertEquals(csi.getBinDepth(), 6); + Assert.assertEquals(ucsi.getBinDepth(), 7); + } + @Test + public static void testGetMinShift() { + Assert.assertEquals(csi.getMinShift(), 14); + Assert.assertEquals(ucsi.getMinShift(), 12); + } + @Test + public static void testGetMaxBins() { + Assert.assertEquals(csi.getMaxBins(), 37449); + Assert.assertEquals(ucsi.getMaxBins(), 299593); + } + @Test + public static void testGetMaxSpan() { + Assert.assertEquals(csi.getMaxSpan(),512*1024*1024); + Assert.assertEquals(ucsi.getMaxSpan(),1024*1024*1024); + } + + @Test + public static void testGetFirstBinInLevelOK() { + Assert.assertEquals(csi.getFirstBinInLevelForCSI(0), 0); + Assert.assertEquals(csi.getFirstBinInLevelForCSI(1), 1); + Assert.assertEquals(csi.getFirstBinInLevelForCSI(2), 9); + Assert.assertEquals(csi.getFirstBinInLevelForCSI(3), 73); + Assert.assertEquals(csi.getFirstBinInLevelForCSI(4), 585); + Assert.assertEquals(csi.getFirstBinInLevelForCSI(5), 4681); + + Assert.assertEquals(ucsi.getFirstBinInLevelForCSI(0), 0); + Assert.assertEquals(ucsi.getFirstBinInLevelForCSI(1), 1); + Assert.assertEquals(ucsi.getFirstBinInLevelForCSI(2), 9); + Assert.assertEquals(ucsi.getFirstBinInLevelForCSI(3), 73); + Assert.assertEquals(ucsi.getFirstBinInLevelForCSI(4), 585); + Assert.assertEquals(ucsi.getFirstBinInLevelForCSI(5), 4681); + Assert.assertEquals(ucsi.getFirstBinInLevelForCSI(6), 37449); + } + + @Test (expectedExceptions = SAMException.class) + public static void testGetFirstBinInLevelFail1() { + csi.getFirstBinInLevelForCSI(6); + } + + @Test (expectedExceptions = SAMException.class) + public static void testGetFirstBinInLevelFail2() { + ucsi.getFirstBinInLevelForCSI(7); + } + + @Test + public static void testGetLevelSizeOK() { + Assert.assertEquals(csi.getLevelSize(0), 1); + Assert.assertEquals(csi.getLevelSize(1), 8); + Assert.assertEquals(csi.getLevelSize(2), 64); + Assert.assertEquals(csi.getLevelSize(3), 512); + Assert.assertEquals(csi.getLevelSize(4), 4096); + Assert.assertEquals(csi.getLevelSize(5), 32768); + + Assert.assertEquals(ucsi.getLevelSize(0), 1); + Assert.assertEquals(ucsi.getLevelSize(1), 8); + Assert.assertEquals(ucsi.getLevelSize(2), 64); + Assert.assertEquals(ucsi.getLevelSize(3), 512); + Assert.assertEquals(ucsi.getLevelSize(4), 4096); + Assert.assertEquals(ucsi.getLevelSize(5), 32768); + Assert.assertEquals(ucsi.getLevelSize(6), 262144); + } + + @Test (expectedExceptions = SAMException.class) + public static void testGetLevelSizeFail1() { + csi.getLevelSize(6); + } + + @Test (expectedExceptions = SAMException.class) + public static void testGetLevelSizeFail12() { + csi.getLevelSize(7); + } + + @Test + public static void testGetAuxData() { + Assert.assertEquals(csi.getAuxData().length, 0); + } + + @Test + public static void testGetNReferences() { + Assert.assertEquals(csi.getNumberOfReferences(), 45); + } + + @Test + public static void testGetParentBinOK() { + Assert.assertEquals(csi.getParentBinNumber(4681), 585); + Assert.assertEquals(csi.getParentBinNumber(4688), 585); + Assert.assertEquals(csi.getParentBinNumber(4689), 586); + Assert.assertEquals(csi.getParentBinNumber(585), 73); + Assert.assertEquals(csi.getParentBinNumber(592), 73); + Assert.assertEquals(csi.getParentBinNumber(593), 74); + Assert.assertEquals(csi.getParentBinNumber(0), 0); + } + + @Test (expectedExceptions = SAMException.class) + public static void testGetParentBinFail() { + csi.getParentBinNumber(37449); + } + + @Test + public static void testRegionToBin() { + Assert.assertEquals(GenomicIndexUtil.regionToBin(12653, 1876491), GenomicIndexUtil.regionToBin(12653, 1876491, 14, 6)); + Assert.assertEquals(GenomicIndexUtil.regionToBin(1048576, 1146880), GenomicIndexUtil.regionToBin(1048576, 1146880, 14, 6)); + Assert.assertEquals(GenomicIndexUtil.regionToBin(536870912, 1073741824), GenomicIndexUtil.regionToBin(536870912, 1073741824, 14, 6)); + } + + @Test + public static void testGetNoCoorCount() { + long noCoorBai = bai.getNoCoordinateCount().longValue(); + long noCoorCsi = csi.getNoCoordinateCount().longValue(); + + Assert.assertEquals(noCoorBai, 279); + Assert.assertEquals(noCoorCsi, 279); + } + + @Test + public static void testGetLevelForBin() { + Assert.assertEquals(csi.getLevelForBin(bin1), 0); + Assert.assertEquals(csi.getLevelForBin(bin2), 1); + Assert.assertEquals(csi.getLevelForBin(bin3), 0); + Assert.assertEquals(csi.getLevelForBin(bin4), 1); + Assert.assertEquals(csi.getLevelForBin(bin5), 2); + Assert.assertEquals(csi.getLevelForBin(bin6), 4); + Assert.assertEquals(csi.getLevelForBin(bin7), 3); + Assert.assertEquals(csi.getLevelForBin(bin8), 5); + + Assert.assertEquals(ucsi.getLevelForBin(bin1), 0); + Assert.assertEquals(ucsi.getLevelForBin(bin2), 1); + Assert.assertEquals(ucsi.getLevelForBin(bin3), 0); + Assert.assertEquals(ucsi.getLevelForBin(bin4), 1); + Assert.assertEquals(ucsi.getLevelForBin(bin5), 2); + Assert.assertEquals(ucsi.getLevelForBin(bin6), 4); + Assert.assertEquals(ucsi.getLevelForBin(bin7), 3); + Assert.assertEquals(ucsi.getLevelForBin(bin8), 5); + Assert.assertEquals(ucsi.getLevelForBin(bin9), 6); + } + + @Test + public static void testGetParentBinNumber() { + Assert.assertEquals(csi.getParentBinNumber(bin1), 0); + Assert.assertEquals(csi.getParentBinNumber(bin2), 0); + Assert.assertEquals(csi.getParentBinNumber(bin3), 0); + Assert.assertEquals(csi.getParentBinNumber(bin4), 0); + Assert.assertEquals(csi.getParentBinNumber(bin5), 1); + Assert.assertEquals(csi.getParentBinNumber(bin6), 73); + Assert.assertEquals(csi.getParentBinNumber(bin7), 9); + Assert.assertEquals(csi.getParentBinNumber(bin8), 586); + + Assert.assertEquals(ucsi.getParentBinNumber(bin1), 0); + Assert.assertEquals(ucsi.getParentBinNumber(bin2), 0); + Assert.assertEquals(ucsi.getParentBinNumber(bin3), 0); + Assert.assertEquals(ucsi.getParentBinNumber(bin4), 0); + Assert.assertEquals(ucsi.getParentBinNumber(bin5), 1); + Assert.assertEquals(ucsi.getParentBinNumber(bin6), 73); + Assert.assertEquals(ucsi.getParentBinNumber(bin7), 9); + Assert.assertEquals(ucsi.getParentBinNumber(bin8), 586); + Assert.assertEquals(ucsi.getParentBinNumber(bin9), 16981); + Assert.assertEquals(ucsi.getParentBinNumber(16981), 2122); + Assert.assertEquals(ucsi.getParentBinNumber(2122), 265); + Assert.assertEquals(ucsi.getParentBinNumber(265), 33); + Assert.assertEquals(ucsi.getParentBinNumber(33), 4); + } + + @Test + public static void testGetFirstLocusInBin() { + Assert.assertEquals(csi.getFirstLocusInBin(bin1), 1); + Assert.assertEquals(csi.getFirstLocusInBin(bin2), 1); + Assert.assertEquals(csi.getFirstLocusInBin(bin3), 1); + Assert.assertEquals(csi.getFirstLocusInBin(bin4), 1); + Assert.assertEquals(csi.getFirstLocusInBin(bin5), 1); + Assert.assertEquals(csi.getFirstLocusInBin(bin6), (1<<17) + 1); + Assert.assertEquals(csi.getFirstLocusInBin(bin7), (1<<20)*7 + 1); + Assert.assertEquals(csi.getFirstLocusInBin(bin8), (1<<14)*8 + 1); + + Assert.assertEquals(ucsi.getFirstLocusInBin(bin1), 1); + Assert.assertEquals(ucsi.getFirstLocusInBin(bin2), 1); + Assert.assertEquals(ucsi.getFirstLocusInBin(bin3), 1); + Assert.assertEquals(ucsi.getFirstLocusInBin(bin4), 1); + Assert.assertEquals(ucsi.getFirstLocusInBin(bin5), 1); + Assert.assertEquals(ucsi.getFirstLocusInBin(bin6), (1<<18) + 1); + Assert.assertEquals(ucsi.getFirstLocusInBin(bin7), (1<<21)*7 + 1); + Assert.assertEquals(ucsi.getFirstLocusInBin(bin8), (1<<15)*8 + 1); + Assert.assertEquals(ucsi.getFirstLocusInBin(bin9), (1<<12)*98404 + 1); + } + + @Test + public static void testGetLastLocusInBin() { + Assert.assertEquals(csi.getLastLocusInBin(bin1), 1<<29); + Assert.assertEquals(csi.getLastLocusInBin(bin2), 1<<26); + Assert.assertEquals(csi.getLastLocusInBin(bin3), 1<<29); + Assert.assertEquals(csi.getLastLocusInBin(bin4), 1<<26); + Assert.assertEquals(csi.getLastLocusInBin(bin5), 1<<23); + Assert.assertEquals(csi.getLastLocusInBin(bin6), (1<<17)*2); + Assert.assertEquals(csi.getLastLocusInBin(bin7), (1<<20)*8); + Assert.assertEquals(csi.getLastLocusInBin(bin8), (1<<14)*9); + + Assert.assertEquals(ucsi.getLastLocusInBin(bin1), 1<<30); + Assert.assertEquals(ucsi.getLastLocusInBin(bin2), 1<<27); + Assert.assertEquals(ucsi.getLastLocusInBin(bin3), 1<<30); + Assert.assertEquals(ucsi.getLastLocusInBin(bin4), 1<<27); + Assert.assertEquals(ucsi.getLastLocusInBin(bin5), 1<<24); + Assert.assertEquals(ucsi.getLastLocusInBin(bin6), (1<<18)*2); + Assert.assertEquals(ucsi.getLastLocusInBin(bin7), (1<<21)*8); + Assert.assertEquals(ucsi.getLastLocusInBin(bin8), (1<<15)*9); + Assert.assertEquals(ucsi.getLastLocusInBin(bin9), (1<<12)*98405); + } + + @Test + public static void testRegionToBins() { + BitSet bs1 = GenomicIndexUtil.regionToBins((1<<12)*98403 + 4094, (1<<12)*98404 + 1, 12, 7 ); + Assert.assertTrue(bs1.get(bin9.getBinNumber()-1)); //135852 + Assert.assertTrue(bs1.get(bin9.getBinNumber())); //135853 + Assert.assertTrue(bs1.get(ucsi.getParentBinNumber(bin9.getBinNumber()))); //16981 + + BitSet bs2 = GenomicIndexUtil.regionToBins(939520000, 939529000, 12, 7 ); + Assert.assertTrue(bs2.get(0)); + Assert.assertTrue(bs2.get(7)); + Assert.assertTrue(bs2.get(8)); + Assert.assertTrue(bs2.get(64)); + Assert.assertTrue(bs2.get(65)); + Assert.assertTrue(bs2.get(520)); + Assert.assertTrue(bs2.get(521)); + Assert.assertTrue(bs2.get(4168)); + Assert.assertTrue(bs2.get(4169)); + Assert.assertTrue(bs2.get(33352)); + Assert.assertTrue(bs2.get(33353)); + Assert.assertTrue(bs2.get(266823)); + Assert.assertTrue(bs2.get(266824)); + Assert.assertTrue(bs2.get(266825)); + Assert.assertTrue(bs2.get(266826)); + } + + @Test + public static void testGetSpanOverlapping() { + BAMFileSpan bfs1 = ucsi.getSpanOverlapping(1, 939520000, 939529000); + BAMFileSpan bfs2 = ucsi.getSpanOverlapping(1, 240000000, 249228250); + BAMFileSpan bfs3 = ubai.getSpanOverlapping(1, 240000000, 249228250); + + Assert.assertTrue(bfs1.isEmpty()); + Assert.assertEquals(bfs2.getChunks(), bfs3.getChunks()); + + BAMFileSpan bfs4 = ucsi.getSpanOverlapping(bin10); + Assert.assertEquals(bfs4.getChunks().size(), 3); + } +} diff --git a/src/test/java/htsjdk/samtools/SamIndexesTest.java b/src/test/java/htsjdk/samtools/SamIndexesTest.java index f78b0f3719..83ecf3670c 100644 --- a/src/test/java/htsjdk/samtools/SamIndexesTest.java +++ b/src/test/java/htsjdk/samtools/SamIndexesTest.java @@ -41,6 +41,26 @@ public void testEmptyBai() throws IOException { } } + @Test + public void testEmptyCsi() throws IOException { + final File csiFile = File.createTempFile("test", ".csi"); + csiFile.deleteOnExit(); + final FileOutputStream fos = new FileOutputStream(csiFile); + fos.write(SamIndexes.CSI.magic); + fos.close(); + + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + baos.write(SamIndexes.CSI.magic); + baos.close(); + + final InputStream inputStream = SamIndexes.asBaiStreamOrNull(new ByteArrayInputStream(baos.toByteArray()), null); + for (final byte b : SamIndexes.CSI.magic) { + Assert.assertEquals(inputStream.read(), 0xFF & b); + } + } + + @Test(expectedExceptions = NullPointerException.class) public void testCraiRequiresDictionary() throws IOException { final ByteArrayOutputStream baos = new ByteArrayOutputStream(); diff --git a/src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam.csi b/src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam.csi new file mode 100644 index 0000000000..93d1a10794 Binary files /dev/null and b/src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam.csi differ diff --git a/src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam.iii b/src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam.iii new file mode 100644 index 0000000000..ddc25e661f Binary files /dev/null and b/src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam.iii differ diff --git a/src/test/resources/htsjdk/samtools/BAMFileIndexTest/uncompressed_index.bam.bai b/src/test/resources/htsjdk/samtools/BAMFileIndexTest/uncompressed_index.bam.bai new file mode 100644 index 0000000000..f3834aa330 Binary files /dev/null and b/src/test/resources/htsjdk/samtools/BAMFileIndexTest/uncompressed_index.bam.bai differ diff --git a/src/test/resources/htsjdk/samtools/BAMFileIndexTest/uncompressed_index.bam.csi b/src/test/resources/htsjdk/samtools/BAMFileIndexTest/uncompressed_index.bam.csi new file mode 100644 index 0000000000..bf2afb003f Binary files /dev/null and b/src/test/resources/htsjdk/samtools/BAMFileIndexTest/uncompressed_index.bam.csi differ