diff --git a/pom.xml b/pom.xml
index 74f8152..cba61ab 100644
--- a/pom.xml
+++ b/pom.xml
@@ -32,7 +32,7 @@
     1.8
     1.8
-    2.16.1
+    2.18.2
     2.7.5
     2.22.0
     0.42
diff --git a/src/htsjdk/java/htsjdk/samtools/BAMSBIIndexer.java b/src/htsjdk/java/htsjdk/samtools/BAMSBIIndexer.java
deleted file mode 100644
index 32c0cf2..0000000
--- a/src/htsjdk/java/htsjdk/samtools/BAMSBIIndexer.java
+++ /dev/null
@@ -1,82 +0,0 @@
-package htsjdk.samtools;
-
-import htsjdk.samtools.cram.io.InputStreamUtils;
-import htsjdk.samtools.seekablestream.SeekablePathStream;
-import htsjdk.samtools.seekablestream.SeekableStream;
-import htsjdk.samtools.util.BlockCompressedInputStream;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.RuntimeEOFException;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-// TODO: remove this class once https://github.com/samtools/htsjdk/pull/1138 is released
-
-/** Writes SBI files for BAM files, as understood by {@link SBIIndex}. */
-public final class BAMSBIIndexer {
-
-  /**
-   * Perform indexing on the given BAM file, at the granularity level specified.
-   *
-   * @param bamFile the path to the BAM file
-   * @param granularity write the offset of every n-th alignment to the index
-   * @throws IOException as per java IO contract
-   */
-  public static void createIndex(final Path bamFile, final long granularity) throws IOException {
-    Path splittingBaiFile = IOUtil.addExtension(bamFile, SBIIndex.FILE_EXTENSION);
-    try (SeekableStream in = new SeekablePathStream(bamFile);
-        OutputStream out = Files.newOutputStream(splittingBaiFile)) {
-      createIndex(in, out, granularity);
-    }
-  }
-
-  /**
-   * Perform indexing on the given BAM file, at the granularity level specified.
-   *
-   * @param in a seekable stream for reading the BAM file from
-   * @param out the stream to write the index to
-   * @param granularity write the offset of every n-th alignment to the index
-   * @throws IOException as per java IO contract
-   */
-  public static void createIndex(
-      final SeekableStream in, final OutputStream out, final long granularity) throws IOException {
-    long recordStart = findVirtualOffsetOfFirstRecordInBam(in);
-    try (BlockCompressedInputStream blockIn = new BlockCompressedInputStream(in)) {
-      blockIn.seek(recordStart);
-      final ByteBuffer byteBuffer =
-          ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN); // BAM is little-endian
-      SBIIndexWriter indexWriter = new SBIIndexWriter(out, granularity);
-      while (true) {
-        try {
-          recordStart = blockIn.getFilePointer();
-          InputStreamUtils.readFully(blockIn, byteBuffer.array(), 0, 4);
-          final int blockSize = byteBuffer.getInt(0); // length of remainder of alignment record
-          indexWriter.processRecord(recordStart);
-          InputStreamUtils.skipFully(blockIn, blockSize);
-        } catch (EOFException e) {
-          break;
-        }
-      }
-      indexWriter.finish(recordStart, in.length());
-    }
-  }
-
-  /**
-   * Returns the virtual file offset of the first record in a BAM file - i.e. the virtual file
-   * offset after skipping over the text header and the sequence records.
-   *
-   * @param seekableStream BAM file
-   * @return the virtual file offset of the first record in the specified BAM file
-   */
-  public static long findVirtualOffsetOfFirstRecordInBam(final SeekableStream seekableStream) {
-    try {
-      return BAMFileReader2.findVirtualOffsetOfFirstRecord(seekableStream);
-    } catch (final IOException ioe) {
-      throw new RuntimeEOFException(ioe);
-    }
-  }
-}
diff --git a/src/htsjdk/java/htsjdk/samtools/SBIIndex.java b/src/htsjdk/java/htsjdk/samtools/SBIIndex.java
deleted file mode 100644
index da002ea..0000000
--- a/src/htsjdk/java/htsjdk/samtools/SBIIndex.java
+++ /dev/null
@@ -1,341 +0,0 @@
-package htsjdk.samtools;
-
-import htsjdk.samtools.util.BinaryCodec;
-import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Serializable;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Objects;
-
-// TODO: remove this class once https://github.com/samtools/htsjdk/pull/1138 is released
-
-/**
- * SBI is an index into BGZF-compressed data files, which has an entry for the file position of the
- * start of every nth record. Reads files that were created by {@link BAMSBIIndexer}.
- */
-public final class SBIIndex implements Serializable {
-
-  public static class Header implements Serializable {
-    private final long fileLength;
-    private final byte[] md5;
-    private final byte[] uuid;
-    private final long totalNumberOfRecords;
-    private final long granularity;
-
-    public Header(
-        long fileLength, byte[] md5, byte[] uuid, long totalNumberOfRecords, long granularity) {
-      this.fileLength = fileLength;
-      this.md5 = md5;
-      this.uuid = uuid;
-      this.totalNumberOfRecords = totalNumberOfRecords;
-      this.granularity = granularity;
-    }
-
-    public long getFileLength() {
-      return fileLength;
-    }
-
-    public byte[] getMd5() {
-      return md5;
-    }
-
-    public byte[] getUuid() {
-      return uuid;
-    }
-
-    public long getTotalNumberOfRecords() {
-      return totalNumberOfRecords;
-    }
-
-    public long getGranularity() {
-      return granularity;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-      if (this == o) return true;
-      if (o == null || getClass() != o.getClass()) return false;
-      Header header = (Header) o;
-      return fileLength == header.fileLength
-          && totalNumberOfRecords == header.totalNumberOfRecords
-          && granularity == header.granularity
-          && Arrays.equals(md5, header.md5)
-          && Arrays.equals(uuid, header.uuid);
-    }
-
-    @Override
-    public int hashCode() {
-      int result = Objects.hash(fileLength, totalNumberOfRecords, granularity);
-      result = 31 * result + Arrays.hashCode(md5);
-      result = 31 * result + Arrays.hashCode(uuid);
-      return result;
-    }
-
-    @Override
-    public String toString() {
-      return "Header{"
-          + "fileLength="
-          + fileLength
-          + ", md5="
-          + Arrays.toString(md5)
-          + ", uuid="
-          + Arrays.toString(uuid)
-          + ", totalNumberOfRecords="
-          + totalNumberOfRecords
-          + ", granularity="
-          + granularity
-          + '}';
-    }
-  }
-
-  public static final String FILE_EXTENSION = ".sbi";
-
-  /** SBI magic number. */
-  static final byte[] SBI_MAGIC = "SBI\1".getBytes();
-
-  private final Header header;
-  private final long[] virtualOffsets;
-
-  /**
-   * Create an in-memory SBI with the given virtual offsets.
-   *
-   * @param header header
-   * @param virtualOffsets the offsets in the index
-   */
-  public SBIIndex(final Header header, final long[] virtualOffsets) {
-    this.header = header;
-    this.virtualOffsets = virtualOffsets;
-    if (this.virtualOffsets.length == 0) {
-      throw new RuntimeException("Invalid SBI format: should contain at least one offset");
-    }
-  }
-
-  /**
-   * Load an SBI into memory from a path.
-   *
-   * @param path the path to the SBI file
-   * @return SBI index
-   * @throws IOException as per java IO contract
-   */
-  public static SBIIndex load(final Path path) throws IOException {
-    try (InputStream in = new BufferedInputStream(Files.newInputStream(path))) {
-      return readIndex(in);
-    }
-  }
-
-  /**
-   * Load an SBI into memory from a stream.
-   *
-   * @param in the stream to read the SBI from
-   * @return SBI index
-   */
-  public static SBIIndex load(final InputStream in) {
-    return readIndex(in);
-  }
-
-  private static SBIIndex readIndex(final InputStream in) {
-    BinaryCodec binaryCodec = new BinaryCodec(in);
-    Header header = readHeader(binaryCodec);
-    long numOffsetsLong = binaryCodec.readLong();
-    if (numOffsetsLong > Integer.MAX_VALUE) {
-      throw new RuntimeException(
-          String.format("Cannot read SBI with more than %s offsets.", Integer.MAX_VALUE));
-    }
-    int numOffsets = (int) numOffsetsLong;
-    long[] virtualOffsets = new long[numOffsets];
-    long prev = -1;
-    for (int i = 0; i < numOffsets; i++) {
-      long cur = binaryCodec.readLong();
-      if (prev > cur) {
-        throw new RuntimeException(
-            String.format("Invalid SBI; offsets not in order: %#x > %#x", prev, cur));
-      }
-      virtualOffsets[i] = cur;
-      prev = cur;
-    }
-    return new SBIIndex(header, virtualOffsets);
-  }
-
-  private static Header readHeader(BinaryCodec binaryCodec) {
-    final byte[] buffer = new byte[SBI_MAGIC.length];
-    binaryCodec.readBytes(buffer);
-    if (!Arrays.equals(buffer, SBI_MAGIC)) {
-      throw new RuntimeException(
-          "Invalid file header in SBI: "
-              + new String(buffer)
-              + " ("
-              + Arrays.toString(buffer)
-              + ")");
-    }
-    long fileLength = binaryCodec.readLong();
-    byte[] md5 = new byte[16];
-    binaryCodec.readBytes(md5);
-    byte[] uuid = new byte[16];
-    binaryCodec.readBytes(uuid);
-    long totalNumberOfRecords = binaryCodec.readLong();
-    long granularity = binaryCodec.readLong();
-    return new Header(fileLength, md5, uuid, totalNumberOfRecords, granularity);
-  }
-
-  /**
-   * Returns the index header.
-   *
-   * @return the header
-   */
-  public Header getHeader() {
-    return header;
-  }
-
-  /**
-   * Returns the granularity of the index, that is the number of alignments between subsequent
-   * entries in the index, or zero if not specified.
-   *
-   * @return the granularity of the index
-   */
-  public long getGranularity() {
-    return header.getGranularity();
-  }
-
-  /**
-   * Returns the entries in the index.
-   *
-   * @return an array of file pointers for all the alignment offsets in the index, in ascending
-   *     order. The last virtual file pointer is the position at which the next record would start
-   *     if it were added to the file.
-   */
-  public long[] getVirtualOffsets() {
-    return virtualOffsets;
-  }
-
-  /**
-   * Returns number of entries in the index.
-   *
-   * @return the number of virtual offsets in the index
-   */
-  public long size() {
-    return virtualOffsets.length;
-  }
-
-  /**
-   * Returns the length of the data file in bytes.
-   *
-   * @return the length of the data file in bytes
-   */
-  public long dataFileLength() {
-    return header.getFileLength();
-  }
-
-  /**
-   * Split the data file for this index into non-overlapping chunks of roughly the given size that
-   * cover the whole file and that can be read independently of one another.
-   *
-   * @param splitSize the rough size of each split in bytes
-   * @return a list of contiguous, non-overlapping, sorted chunks that cover the whole data file
-   * @see #getChunk(long, long)
-   */
-  public List<Chunk> split(long splitSize) {
-    if (splitSize <= 0) {
-      throw new IllegalArgumentException(
-          String.format("Split size must be positive: %s", splitSize));
-    }
-    long fileSize = dataFileLength();
-    List<Chunk> chunks = new ArrayList<>();
-    for (long splitStart = 0; splitStart < fileSize; splitStart += splitSize) {
-      Chunk chunk = getChunk(splitStart, splitStart + splitSize);
-      if (chunk != null) {
-        chunks.add(chunk);
-      }
-    }
-    return chunks;
-  }
-
-  /**
-   * Return a chunk that corresponds to the given range in the data file. Note that the chunk does
-   * not necessarily completely cover the given range, however this method will map a set of
-   * contiguous, non-overlapping file ranges that cover the whole data file to a set of contiguous,
-   * non-overlapping chunks that cover the whole data file.
-   *
-   * @param splitStart the start of the file range (inclusive)
-   * @param splitEnd the end of the file range (exclusive)
-   * @return a chunk whose virtual start is at the first alignment start position that is greater
-   *     than or equal to the given split start position, and whose virtual end is at the first
-   *     alignment start position that is greater than or equal to the given split end position, or
-   *     null if the chunk would be empty.
-   * @see #split(long)
-   */
-  public Chunk getChunk(long splitStart, long splitEnd) {
-    if (splitStart >= splitEnd) {
-      throw new IllegalArgumentException(
-          String.format("Split start (%s) must be less than end (%s)", splitStart, splitEnd));
-    }
-    long lastVirtualOffset = virtualOffsets[virtualOffsets.length - 1];
-    long maxEnd = BlockCompressedFilePointerUtil.getBlockAddress(lastVirtualOffset);
-    splitStart = Math.min(splitStart, maxEnd);
-    splitEnd = Math.min(splitEnd, maxEnd);
-    long virtualSplitStart = BlockCompressedFilePointerUtil.makeFilePointer(splitStart);
-    long virtualSplitEnd = BlockCompressedFilePointerUtil.makeFilePointer(splitEnd);
-    long virtualSplitStartAlignment = ceiling(virtualSplitStart);
-    long virtualSplitEndAlignment = ceiling(virtualSplitEnd);
-    if (virtualSplitStartAlignment == virtualSplitEndAlignment) {
-      return null;
-    }
-    return new Chunk(virtualSplitStartAlignment, virtualSplitEndAlignment);
-  }
-
-  private long ceiling(long virtualOffset) {
-    int index = Arrays.binarySearch(virtualOffsets, virtualOffset);
-    if (index < 0) {
-      index = -index - 1;
-      if (index == virtualOffsets.length) {
-        long lastVirtualOffset = virtualOffsets[virtualOffsets.length - 1];
-        throw new IllegalArgumentException(
-            String.format(
-                "No virtual offset found for virtual file pointer %s, last virtual offset %s",
-                BlockCompressedFilePointerUtil.asString(virtualOffset),
-                BlockCompressedFilePointerUtil.asString(lastVirtualOffset)));
-      }
-    }
-    return virtualOffsets[index];
-  }
-
-  @Override
-  public boolean equals(Object o) {
-    if (this == o) return true;
-    if (o == null || getClass() != o.getClass()) return false;
-    SBIIndex sbiIndex = (SBIIndex) o;
-    return Objects.equals(header, sbiIndex.header)
-        && Arrays.equals(virtualOffsets, sbiIndex.virtualOffsets);
-  }
-
-  @Override
-  public int hashCode() {
-    int result = Objects.hash(header);
-    result = 31 * result + Arrays.hashCode(virtualOffsets);
-    return result;
-  }
-
-  @Override
-  public String toString() {
-    String virtualOffsetsString;
-    if (virtualOffsets.length > 30) {
-      virtualOffsetsString =
-          Arrays.toString(Arrays.copyOfRange(virtualOffsets, 0, 30)).replace("]", ", ...]");
-    } else {
-      virtualOffsetsString = Arrays.toString(virtualOffsets);
-    }
-    return "SBIIndex{"
-        + "header="
-        + header
-        + ", numVirtualOffsets="
-        + virtualOffsets.length
-        + ", virtualOffsets="
-        + virtualOffsetsString
-        + '}';
-  }
-}
diff --git a/src/htsjdk/java/htsjdk/samtools/SBIIndexWriter.java b/src/htsjdk/java/htsjdk/samtools/SBIIndexWriter.java
deleted file mode 100644
index c4ccc5c..0000000
--- a/src/htsjdk/java/htsjdk/samtools/SBIIndexWriter.java
+++ /dev/null
@@ -1,163 +0,0 @@
-package htsjdk.samtools;
-
-import htsjdk.samtools.util.BinaryCodec;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.RuntimeIOException;
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-// TODO: remove this class once https://github.com/samtools/htsjdk/pull/1138 is released
-
-/**
- * Writes SBI files as understood by {@link SBIIndex}.
- *
- * <p>To use this class, first construct an instance from an output stream, and a desired
- * granularity. Then for each record in the file being indexed, pass the virtual file offset of the
- * record to the {@link #processRecord} method. The indexer will keep a count of the records passed
- * in, and index every nth record. When there are no records left, call {@link #finish} to
- * complete writing the index.
- */
-public final class SBIIndexWriter {
-
-  // Default to a granularity level of 4096. This is generally sufficient
-  // for very large BAM files, relative to a maximum heap size in the
-  // gigabyte range.
-  public static final long DEFAULT_GRANULARITY = 4096;
-
-  static final byte[] EMPTY_MD5 = new byte[16];
-  static final byte[] EMPTY_UUID = new byte[16];
-
-  private final OutputStream out;
-  private final long granularity;
-  private final File tempOffsetsFile;
-  private final BinaryCodec tempOffsetsCodec;
-  private long prev = -1;
-  private long recordCount;
-  private long virtualOffsetCount;
-
-  /**
-   * Prepare to write an SBI index with the default granularity.
-   *
-   * @param out the stream to write the index to
-   */
-  public SBIIndexWriter(final OutputStream out) {
-    this(out, SBIIndexWriter.DEFAULT_GRANULARITY);
-  }
-
-  /**
-   * Prepare to write an SBI index.
-   *
-   * @param out the stream to write the index to
-   * @param granularity write the offset of every nth record to the index
-   */
-  public SBIIndexWriter(final OutputStream out, final long granularity) {
-    this.out = out;
-    this.granularity = granularity;
-    try {
-      // Write the offsets to a temporary file, then write the entire file contents to the output
-      // stream at the end, once we know the number of offsets. This is more efficient than using
-      // a List<Long> for very large numbers of offsets (e.g. 10^8, which is possible for low
-      // granularity), since the list resizing operation is slow.
-      this.tempOffsetsFile = File.createTempFile("offsets-", ".headerless.sbi");
-      this.tempOffsetsCodec =
-          new BinaryCodec(new BufferedOutputStream(new FileOutputStream(tempOffsetsFile)));
-    } catch (IOException e) {
-      throw new RuntimeIOException(e);
-    }
-  }
-
-  /**
-   * Process a record for the index: the offset of every nth record will be written to the
-   * index.
-   *
-   * @param virtualOffset virtual file pointer of the record
-   */
-  public void processRecord(final long virtualOffset) {
-    if (recordCount++ % granularity == 0) {
-      writeVirtualOffset(virtualOffset);
-    }
-  }
-
-  void writeVirtualOffset(long virtualOffset) {
-    if (prev > virtualOffset) {
-      throw new IllegalArgumentException(
-          String.format("Offsets not in order: %#x > %#x", prev, virtualOffset));
-    }
-    tempOffsetsCodec.writeLong(virtualOffset);
-    virtualOffsetCount++;
-    prev = virtualOffset;
-  }
-
-  /**
-   * Complete the index, and close the output stream.
-   *
-   * @param finalVirtualOffset the virtual offset at which the next record would start if it were
-   *     added to the file
-   * @param dataFileLength the length of the data file in bytes
-   */
-  public void finish(long finalVirtualOffset, long dataFileLength) {
-    finish(finalVirtualOffset, dataFileLength, null, null);
-  }
-
-  /**
-   * Complete the index, and close the output stream.
-   *
-   * @param finalVirtualOffset the virtual offset at which the next record would start if it were
-   *     added to the file
-   * @param dataFileLength the length of the data file in bytes
-   * @param md5 the MD5 hash of the data file, or null if not specified
-   * @param uuid the UUID for the data file, or null if not specified
-   */
-  private void finish(long finalVirtualOffset, long dataFileLength, byte[] md5, byte[] uuid) {
-    if (md5 != null && md5.length != 16) {
-      throw new IllegalArgumentException("Invalid MD5 length: " + md5.length);
-    }
-    if (uuid != null && uuid.length != 16) {
-      throw new IllegalArgumentException("Invalid UUID length: " + uuid.length);
-    }
-    SBIIndex.Header header =
-        new SBIIndex.Header(
-            dataFileLength,
-            md5 == null ? EMPTY_MD5 : md5,
-            uuid == null ? EMPTY_UUID : uuid,
-            recordCount,
-            granularity);
-    finish(header, finalVirtualOffset);
-  }
-
-  void finish(SBIIndex.Header header, long finalVirtualOffset) {
-    // complete writing the temp offsets file
-    writeVirtualOffset(finalVirtualOffset);
-    tempOffsetsCodec.close();
-
-    try (BinaryCodec binaryCodec = new BinaryCodec(out);
-        InputStream tempOffsets = new BufferedInputStream(new FileInputStream(tempOffsetsFile))) {
-      writeHeader(header, binaryCodec);
-      IOUtil.copyStream(tempOffsets, out);
-    } catch (IOException e) {
-      throw new RuntimeIOException(e);
-    } finally {
-      tempOffsetsFile.delete();
-    }
-  }
-
-  private void writeHeader(SBIIndex.Header header, BinaryCodec binaryCodec) {
-    binaryCodec.writeBytes(SBIIndex.SBI_MAGIC);
-    binaryCodec.writeLong(header.getFileLength());
-    binaryCodec.writeBytes(header.getMd5());
-    binaryCodec.writeBytes(header.getUuid());
-    binaryCodec.writeLong(header.getTotalNumberOfRecords());
-    binaryCodec.writeLong(header.getGranularity());
-    binaryCodec.writeLong(virtualOffsetCount);
-  }
-}
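
Note: the classes deleted above were temporary copies of code that landed upstream in htsjdk via https://github.com/samtools/htsjdk/pull/1138, which is why they can be dropped once the project depends on a released htsjdk that contains them (presumably what the 2.16.1 -> 2.18.2 bump in pom.xml provides). For reference, here is a minimal sketch of the indexing flow these classes implement, assuming the upstream htsjdk classes keep the same entry points shown in the deleted code (BAMSBIIndexer.createIndex, SBIIndex.load, SBIIndex.split); sample.bam is a hypothetical input file.

```java
import htsjdk.samtools.BAMSBIIndexer;
import htsjdk.samtools.Chunk;
import htsjdk.samtools.SBIIndex;
import htsjdk.samtools.SBIIndexWriter;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

public class SbiExample {
  public static void main(String[] args) throws IOException {
    Path bam = Paths.get("sample.bam"); // hypothetical input BAM

    // Write sample.bam.sbi next to the BAM, recording the virtual file offset
    // of every 4096th alignment (the default granularity).
    BAMSBIIndexer.createIndex(bam, SBIIndexWriter.DEFAULT_GRANULARITY);

    // Load the index and carve the BAM into ~128 MiB chunks whose boundaries
    // fall on record starts, so each chunk can be read independently.
    SBIIndex index = SBIIndex.load(Paths.get(bam + SBIIndex.FILE_EXTENSION));
    List<Chunk> chunks = index.split(128L * 1024 * 1024);
    System.out.printf(
        "%d records, %d chunks%n",
        index.getHeader().getTotalNumberOfRecords(), chunks.size());
  }
}
```

Granularity is the main tuning knob here: a smaller value gives more precise split boundaries at the cost of a larger index, which is also why SBIIndexWriter buffers offsets in a temporary file rather than a heap-allocated list.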