Commit

ready

takutosato committed Aug 29, 2023
1 parent 4299d3d commit bd6e979
Showing 9 changed files with 63 additions and 76 deletions.
1 change: 1 addition & 0 deletions src/main/java/htsjdk/samtools/BAMFileReader.java
@@ -32,6 +32,7 @@
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
66 changes: 27 additions & 39 deletions src/main/java/htsjdk/samtools/BamFileIoUtils.java
@@ -1,5 +1,6 @@
package htsjdk.samtools;

import htsjdk.beta.exception.HtsjdkException;
import htsjdk.samtools.cram.io.CountingInputStream;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
@@ -85,67 +86,54 @@ public static void blockCopyBamFile(final File inputFile, final OutputStream out
}

/**
* Copy data from a BAM file to an OutputStream by directly copying the gzip blocks
* Copy data from a BAM file to an OutputStream by directly copying the gzip blocks.
*
* @param inputFile The file to be copied
* @param inputFile The BAM file to be copied
* @param outputStream The stream to write the copied data to
* @param skipHeader If true, the header of the input file will not be copied to the output stream
* @param skipTerminator If true, the terminator block of the input file will not be written to the output stream
*/
public static void blockCopyBamFile(final Path inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) {
// tsato: use CountingInputStream to replace FileInputStream. The latter has .getChannel().getPosition
// The regular InputStream from Files.newInputStream() doesn't have it, but I might be able to do so by creating Channel object and avoid using CountingInputStream
try (final CountingInputStream in = new CountingInputStream(Files.newInputStream(inputFile));
final SeekablePathStream in2 = new SeekablePathStream(inputFile)){
// try (final InputStream in = Files.newInputStream(inputFile)){
try (final SeekablePathStream in = new SeekablePathStream(inputFile)){
// a) It's good to check that the end of the file is valid and b) we need to know if there's a terminator block and not copy it if skipTerminator is true
final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(inputFile);
if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE)
throw new SAMException(inputFile.toUri() + " does not have a valid GZIP block at the end of the file.");

if (skipHeader) { // tsato: this method assumes bam file...should I test cram?
// tsato: reason virtual offset of first read when the input is file doesn't need seekable stream is the blockcompressed one
// calls seekable. Then
final long vOffsetOfFirstRecord0 = SAMUtils.findVirtualOffsetOfFirstRecordInBam(inputFile.toFile()); // tsato: just for testing purposes
if (skipHeader) {
final long vOffsetOfFirstRecord = SAMUtils.findVirtualOffsetOfFirstRecordInBam(inputFile);
int d = 3;

final SeekablePathStream seekablePathStream = new SeekablePathStream(inputFile);
final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(seekablePathStream); // Louis: Have BlockCompressedInputStream take a path?
blockIn.seek(vOffsetOfFirstRecord); // This is why we give the BlockCompressedInputStream a SeekablePathStream rather than the regular InputStream
final long remainingInBlock = blockIn.available(); // Louis is probably concerned that I'm passing the same seekablePathStream object

// If we found the end of the header then write the remainder of this block out as a
// new gzip block and then break out of the while loop
if (remainingInBlock >= 0) {
final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(outputStream, (Path)null);
IOUtil.transferByStream(blockIn, blockOut, remainingInBlock);
blockOut.flush();
// Don't close blockOut because closing underlying stream would break everything
}

long pos = BlockCompressedFilePointerUtil.getBlockAddress(blockIn.getFilePointer()); // tsato: what exactly is a filepointer...
blockIn.close();

in2.seek(pos); // per Louis. But CountingInputStream cannot seek.
while (pos > 0) {
pos -= in.skip(pos);
// tsato: curious --- why do we need BlockCompressedInputStream at all here?
try (final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(inputFile)) {
blockIn.seek(vOffsetOfFirstRecord);
final long remainingInBlock = blockIn.available();

// If we found the end of the header then write the remainder of this block out as a
// new gzip block and then break out of the while loop
if (remainingInBlock >= 0) {
final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(outputStream, (Path) null);
IOUtil.transferByStream(blockIn, blockOut, remainingInBlock);
blockOut.flush();
// Don't close blockOut because closing underlying stream would break everything
}

final long pos = BlockCompressedFilePointerUtil.getBlockAddress(blockIn.getFilePointer());
blockIn.close();

in.seek(pos);
} catch (IOException e){
throw new HtsjdkException("Encountered an error ", e);
}
}

// Copy remainder of input stream into output stream
final long currentPos = in.getCount(); // tsato: assuming currentPos is the position after writing the first block, this should be ok.
final long currentPos2 = in2.position();
if (currentPos2 != currentPos){
int d = 3;
}
final long currentPos = in.position();
final long length = Files.size(inputFile);
final long skipLast = ((term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) && skipTerminator) ?
BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0;
final long bytesToWrite = length - skipLast - currentPos;

// tsato: test in2
IOUtil.transferByStream(in2, outputStream, bytesToWrite);
IOUtil.transferByStream(in, outputStream, bytesToWrite);
} catch (final IOException ioe) {
throw new RuntimeIOException(ioe);
}
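For orientation, here is a minimal sketch of how the Path-based blockCopyBamFile overload above might be called. Only the BamFileIoUtils.blockCopyBamFile signature comes from the diff; the class name, method name, and file paths in the sketch are placeholders.

import htsjdk.samtools.BamFileIoUtils;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

class BlockCopyExample {
    static void copyWithoutHeaderOrTerminator() throws IOException {
        final Path input = Paths.get("example.bam");              // placeholder input BAM
        final Path copy = Files.createTempFile("copy", ".bam");
        try (final OutputStream out = Files.newOutputStream(copy)) {
            // Copy the BGZF blocks directly, skipping the header and the terminator block
            BamFileIoUtils.blockCopyBamFile(input, out, true, true);
        }
    }
}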
4 changes: 2 additions & 2 deletions src/main/java/htsjdk/samtools/SAMUtils.java
@@ -679,9 +679,9 @@ public static int combineMapqs(int m1, int m2) {

}


public static long findVirtualOffsetOfFirstRecordInBam(final Path bamFile) {
try {
SeekableStream ss = new SeekablePathStream(bamFile);
try (SeekableStream ss = new SeekablePathStream(bamFile)){
return BAMFileReader.findVirtualOffsetOfFirstRecord(ss);
} catch (final IOException ioe) {
throw new RuntimeEOFException(ioe);
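The value returned by findVirtualOffsetOfFirstRecordInBam is a BGZF virtual file offset: the upper 48 bits hold the compressed byte address of a block and the lower 16 bits the offset within that block's uncompressed contents. A small sketch of inspecting it through the Path overload above; the path is a placeholder.

import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;

import java.nio.file.Path;
import java.nio.file.Paths;

class VirtualOffsetExample {
    static void inspectFirstRecordOffset() {
        final Path bam = Paths.get("example.bam");  // placeholder
        final long vOffset = SAMUtils.findVirtualOffsetOfFirstRecordInBam(bam);
        // Upper 48 bits: byte address of the BGZF block within the compressed file
        final long blockAddress = BlockCompressedFilePointerUtil.getBlockAddress(vOffset);
        // Lower 16 bits: offset into the uncompressed contents of that block
        final int offsetInBlock = BlockCompressedFilePointerUtil.getBlockOffset(vOffset);
        System.out.println(blockAddress + ":" + offsetInBlock);
    }
}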
4 changes: 2 additions & 2 deletions src/main/java/htsjdk/samtools/SamFileValidator.java
@@ -210,7 +210,7 @@ public boolean validateSamFileVerbose(final SamReader samReader, final Reference
}

public void validateBamFileTermination(final File inputFile) {
validateBamFileTermination(inputFile.toPath());
validateBamFileTermination(IOUtil.toPath(inputFile));
}

public void validateBamFileTermination(final Path inputFile) {
@@ -222,7 +222,7 @@ public void validateBamFileTermination(final Path inputFile) {
BlockCompressedInputStream.checkTermination(inputFile);
if (terminationState.equals(BlockCompressedInputStream.FileTermination.DEFECTIVE)) {
addError(new SAMValidationError(Type.TRUNCATED_FILE, "BAM file has defective last gzip block",
inputFile.toAbsolutePath().toString())); // tsato: confirm toAbsolutePath() is the right thing to do here
inputFile.toUri().toString()));
} else if (terminationState.equals(BlockCompressedInputStream.FileTermination.HAS_HEALTHY_LAST_BLOCK)) {
addError(new SAMValidationError(Type.BAM_FILE_MISSING_TERMINATOR_BLOCK,
"Older BAM file -- does not have terminator block",
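A short sketch of how the Path-based termination check above might be driven; the validator constructor arguments and the path are illustrative, not taken from this commit.

import htsjdk.samtools.SamFileValidator;

import java.io.PrintWriter;
import java.nio.file.Path;
import java.nio.file.Paths;

class TerminationCheckExample {
    static void checkTermination() {
        final Path bam = Paths.get("example.bam");  // placeholder
        final SamFileValidator validator = new SamFileValidator(new PrintWriter(System.out), 100);
        // Adds TRUNCATED_FILE for a defective last gzip block, or
        // BAM_FILE_MISSING_TERMINATOR_BLOCK for an older BAM with no terminator
        validator.validateBamFileTermination(bam);
    }
}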
17 changes: 14 additions & 3 deletions src/main/java/htsjdk/samtools/util/BlockCompressedInputStream.java
@@ -29,6 +29,7 @@
import htsjdk.samtools.seekablestream.SeekableBufferedStream;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableHTTPStream;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.zip.InflaterFactory;

@@ -63,7 +64,7 @@ public class BlockCompressedInputStream extends InputStream implements LocationA

private InputStream mStream = null;
private boolean mIsClosed = false;
private SeekableStream mFile = null; // tsato: change name to mPath?
private SeekableStream mFile = null;
private byte[] mFileBuffer = null;
private DecompressedBlock mCurrentBlock = null;
private int mCurrentOffset = 0;
@@ -123,6 +124,15 @@ public BlockCompressedInputStream(final File file) throws IOException {
this(file, BlockGunzipper.getDefaultInflaterFactory());
}


/**
* Equivalent of the constructor that takes a File, for a Path input. Supports seek().
*/
public BlockCompressedInputStream(final Path file) throws IOException {
this(new SeekablePathStream(file));
}


/**
* Use this ctor if you wish to call seek()
* @param file source of bytes
Expand All @@ -135,6 +145,7 @@ public BlockCompressedInputStream(final File file, final InflaterFactory inflate
blockGunzipper = new BlockGunzipper(inflaterFactory);
}


/**
* @param url source of bytes
*/
@@ -359,7 +370,7 @@ public void seek(final long pos) throws IOException {
}

// Cannot seek on streams that are not file based
if (mFile == null) { // tsato: mFile is a seekable stream---
if (mFile == null) {
throw new IOException(CANNOT_SEEK_STREAM_MSG);
}

@@ -706,7 +717,7 @@ static void readFully(SeekableByteChannel channel, ByteBuffer dst) throws IOExce

@Deprecated
public static void assertNonDefectiveFile(final File file) throws IOException {
assertNonDefectivePath(file.toPath());
assertNonDefectivePath(IOUtil.toPath(file));
}

public static void assertNonDefectivePath(final Path file) throws IOException {
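Since the new Path constructor above wraps a SeekablePathStream, seek() should work on the resulting stream. A minimal sketch, with a placeholder path:

import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.util.BlockCompressedInputStream;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

class PathSeekExample {
    static void readRestOfHeaderBlock() throws IOException {
        final Path bam = Paths.get("example.bam");  // placeholder
        final long vOffset = SAMUtils.findVirtualOffsetOfFirstRecordInBam(bam);
        try (final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(bam)) {
            blockIn.seek(vOffset);                // seeking works because the stream is backed by a SeekablePathStream
            final byte[] rest = new byte[blockIn.available()];
            final int read = blockIn.read(rest);  // remainder of the current decompressed block
            System.out.println("bytes left in block: " + read);
        }
    }
}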
44 changes: 15 additions & 29 deletions src/test/java/htsjdk/samtools/BamFileIoUtilsUnitTest.java
@@ -10,7 +10,6 @@

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -20,9 +19,8 @@
public class BamFileIoUtilsUnitTest extends HtsjdkTest {
final static String TEST_DATA_DIR = "src/test/resources/htsjdk/samtools/cram/";
final static String NA12878_8000 = TEST_DATA_DIR + "CEUTrio.HiSeq.WGS.b37.NA12878.20.first.8000.bam";
final static String NA12878_500 = TEST_DATA_DIR + "CEUTrio.HiSeq.WGS.b37.NA12878.20.first.500.bam";
final static String NA12878_20_21 = TEST_DATA_DIR + "NA12878.20.21.unmapped.orig.bam";
final static int DEFAULT_NUM_RECORDS_TO_COMPARE = 10;


@DataProvider(name="ReheaderBamFileTestInput")
public Object[][] getReheaderBamFileTestInput() { // tsato: is this the right naming scheme? e.g. get(method-name)Input
@@ -37,8 +35,8 @@

@Test(dataProvider = "ReheaderBamFileTestInput")
public void testReheaderBamFile(final boolean createMd5, final boolean createIndex) throws IOException {
final File originalBam = new File(NA12878_8000);
SAMFileHeader header = SamReaderFactory.make().getFileHeader(new File(NA12878_8000));
final File originalBam = new File(NA12878_500);
SAMFileHeader header = SamReaderFactory.make().getFileHeader(new File(NA12878_500));
header.addComment("This is a new, modified header");

final Path output = Files.createTempFile("output", ".bam");
@@ -48,8 +46,9 @@ public void testReheaderBamFile(final boolean createMd5, final boolean createInd
final SamReader outputReader = SamReaderFactory.make().open(output);
Assert.assertEquals(outputReader.getFileHeader(), header);

// Confirm that the reads are the same as the original
Assert.assertTrue(compareBamReads(originalBam, output.toFile(), DEFAULT_NUM_RECORDS_TO_COMPARE)); // tsato: toFile a stop gap
// Check that the reads are the same as the original
// tsato: should I be using something similar to IOUtil.toPath for converting Path -> File to propagate null?
assertBamRecordsEqual(originalBam, output.toFile());

if (createMd5){
Assert.assertTrue(Files.exists(output.resolveSibling(output.getFileName() + FileExtensions.MD5)));
@@ -64,30 +63,18 @@
* Compare first (numRecordsToCompare) reads of two bam files.
* We do not check for equality of the headers.
*
* @param numRecordsToCompare the number of reads to compare
*
* @return true if the first (numRecordsToCompare) reads are equal, else false
*/
private boolean compareBamReads(final File bam1, final File bam2, final int numRecordsToCompare){
SamReader reader = SamReaderFactory.make().open(bam1.toPath());
final Iterator<SAMRecord> originalBamIterator = SamReaderFactory.make().open(bam1.toPath()).iterator();
final Iterator<SAMRecord> outputBamIterator = SamReaderFactory.make().open(bam2.toPath()).iterator();

// tsato: per Louis...somehow failing the tests
boolean louis = false;
if (louis){
private void assertBamRecordsEqual(final File bam1, final File bam2){
try (SamReader reader1 = SamReaderFactory.make().open(bam1);
SamReader reader2 = SamReaderFactory.make().open(bam2)) {
final Iterator<SAMRecord> originalBamIterator = reader1.iterator();
final Iterator<SAMRecord> outputBamIterator = reader2.iterator();

Assert.assertEquals(originalBamIterator, outputBamIterator);
} catch (Exception e){
throw new HtsjdkException("Encountered an error reading bam files: " + bam1.getAbsolutePath() + " and " + bam2.getAbsolutePath(), e);
}
for (int i = 0; i < numRecordsToCompare; i++){
final SAMRecord originalRead = originalBamIterator.next();
final SAMRecord outputRead = outputBamIterator.next();
if (! originalRead.equals(outputRead)){
return false;
}
}


return true;
}

@DataProvider(name="BlockCopyBamFileTestInput")
@@ -102,7 +89,6 @@ public Object[][] getBlockCopyBamFileTestInput() {

@Test(dataProvider = "BlockCopyBamFileTestInput")
public void testBlockCopyBamFile(final boolean skipHeader, final boolean skipTerminator) throws IOException {
System.out.println(skipHeader + ", " + skipTerminator);
final File output = File.createTempFile("output", ".bam");
try (final OutputStream out = Files.newOutputStream(output.toPath())) {
final Path input = Paths.get(NA12878_8000);
@@ -118,7 +104,7 @@ public void testBlockCopyBamFile(final boolean skipHeader, final boolean skipTer
} else {
Assert.assertEquals(outputReader.getFileHeader(), inputReader.getFileHeader());
// Reading will fail when the header is absent
Assert.assertTrue(compareBamReads(input.toFile(), output, DEFAULT_NUM_RECORDS_TO_COMPARE));
assertBamRecordsEqual(input.toFile(), output);
}

if (skipTerminator) {
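The new assertBamRecordsEqual helper leans on TestNG's iterator overload of Assert.assertEquals, which, as I understand it, walks both iterators in lockstep and fails on the first mismatching element, so the old numRecordsToCompare parameter is no longer needed. A toy sketch with placeholder values:

import org.testng.Assert;

import java.util.Arrays;
import java.util.Iterator;

class IteratorAssertExample {
    static void compareElementwise() {
        final Iterator<String> actual = Arrays.asList("read1", "read2").iterator();
        final Iterator<String> expected = Arrays.asList("read1", "read2").iterator();
        // Passes only if both iterators yield equal elements and end together
        Assert.assertEquals(actual, expected);
    }
}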
3 changes: 2 additions & 1 deletion src/test/java/htsjdk/samtools/HtsjdkTestUtils.java
@@ -5,6 +5,7 @@
public class HtsjdkTestUtils {
// tsato: change to HtsPath...
public static final String TEST_DATA_DIR = "src/test/resources/htsjdk/samtools/cram/";
// tsato: silly to use HtsPath when I know it can't handle e.g. gcloud files?
public static final HtsPath NA12878_8000 = new HtsPath(TEST_DATA_DIR + "CEUTrio.HiSeq.WGS.b37.NA12878.20.first.8000.bam");
public static final String NA12878_20_21 = TEST_DATA_DIR + "NA12878.20.21.unmapped.orig.bam";
public static final HtsPath NA12878_20_21 = new HtsPath(TEST_DATA_DIR + "NA12878.20.21.unmapped.orig.bam");
}
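On the question in the comment above about cloud files: as far as I know, HtsPath resolves through whatever java.nio FileSystemProvider matches the URI scheme, so a gs:// input can work when a GCS NIO provider is on the classpath. A small sketch using the constant defined above; the printout is illustrative.

import htsjdk.io.HtsPath;
import htsjdk.samtools.HtsjdkTestUtils;

import java.nio.file.Path;

class HtsPathExample {
    static void resolveTestBam() {
        final HtsPath bam = HtsjdkTestUtils.NA12878_8000;  // constant defined above
        final Path p = bam.toPath();                        // resolves via the matching FileSystemProvider
        System.out.println(bam.getRawInputString() + " -> " + p.toUri());
    }
}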
Binary file not shown.
Binary file not shown.
