Skip to content

Commit

Permalink
testing around
Browse files Browse the repository at this point in the history
  • Loading branch information
takutosato committed Aug 29, 2023
1 parent 583e0f3 commit 4299d3d
Show file tree
Hide file tree
Showing 7 changed files with 147 additions and 61 deletions.
4 changes: 2 additions & 2 deletions src/main/java/htsjdk/samtools/BAMFileReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -297,8 +297,8 @@ private BAMFileReader(final BlockCompressedInputStream compressedInputStream,
this.eagerDecode = eagerDecode;
this.mValidationStringency = validationStringency;
this.samRecordFactory = samRecordFactory;
this.mFileHeader = readHeader(this.mStream, this.mValidationStringency, source);
mFirstRecordPointer = mCompressedInputStream.getFilePointer();
this.mFileHeader = readHeader(this.mStream, this.mValidationStringency, source); // tsato: move the "pointer" to the position of first read
mFirstRecordPointer = mCompressedInputStream.getFilePointer(); // tsato: where do we "seek"? Why did we get away without seekableStream before?
}

/**
Expand Down
57 changes: 32 additions & 25 deletions src/main/java/htsjdk/samtools/BamFileIoUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,12 @@
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.Md5CalculatingOutputStream;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.utils.ValidationUtils;
import org.apache.commons.compress.utils.FileNameUtils;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
Expand Down Expand Up @@ -46,9 +45,8 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path
/**
* Support File input types for backward compatibility. Use the same method with Path inputs below.
*/
@Deprecated
public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File inputFile, final File outputFile, final boolean createMd5, final boolean createIndex) {
reheaderBamFile(samFileHeader, inputFile.toPath(), outputFile.toPath(), createMd5, createIndex);
reheaderBamFile(samFileHeader, IOUtil.toPath(inputFile), IOUtil.toPath(outputFile), createMd5, createIndex);
}

/**
Expand All @@ -61,8 +59,10 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File
* @param createIndex Whether or not to create an index file for the new BAM
*/
public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path inputFile, final Path outputFile, final boolean createMd5, final boolean createIndex) {
ValidationUtils.nonNull(inputFile);
ValidationUtils.nonNull(outputFile);
IOUtil.assertFileIsReadable(inputFile);
// IOUtil.assertFileIsWritable(outputFile); // tsato: what do I do with this...
IOUtil.assertFileIsWritable(inputFile);

try {
BlockCompressedInputStream.assertNonDefectivePath(inputFile);
Expand All @@ -80,12 +80,8 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path
}
}

/**
* @deprecated as of August 2023. Use the method by the same name below with Path input
*/
@Deprecated
public static void blockCopyBamFile(final File inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) {
blockCopyBamFile(inputFile.toPath(), outputStream, skipHeader, skipTerminator);
blockCopyBamFile(IOUtil.toPath(inputFile), outputStream, skipHeader, skipTerminator);
}

/**
Expand All @@ -99,19 +95,25 @@ public static void blockCopyBamFile(final File inputFile, final OutputStream out
public static void blockCopyBamFile(final Path inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) {
// tsato: use CountingInputStream to replace FileInputStream. The latter has .getChannel().getPosition
// The regular InputStream from Files.newInputStream() doesn't have it, but I might be able to do so by creating Channel object and avoid using CountingInputStream
try (final CountingInputStream in = new CountingInputStream(Files.newInputStream(inputFile))){
try (final CountingInputStream in = new CountingInputStream(Files.newInputStream(inputFile));
final SeekablePathStream in2 = new SeekablePathStream(inputFile)){
// try (final InputStream in = Files.newInputStream(inputFile)){
// a) It's good to check that the end of the file is valid and b) we need to know if there's a terminator block and not copy it if skipTerminator is true
final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(inputFile);
if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE)
throw new SAMException(inputFile.toUri() + " does not have a valid GZIP block at the end of the file.");

if (skipHeader) { // tsato: this method assumes bam file...should I test cram?
// tsato: reason virtual offset of first read when the input is file doesn't need seekable stream is the blockcompressed one
// calls seekable. Then
final long vOffsetOfFirstRecord0 = SAMUtils.findVirtualOffsetOfFirstRecordInBam(inputFile.toFile()); // tsato: just for testing purposes
final long vOffsetOfFirstRecord = SAMUtils.findVirtualOffsetOfFirstRecordInBam(inputFile);
int d = 3;

final SeekablePathStream seekablePathStream = new SeekablePathStream(inputFile);
final long vOffsetOfFirstRecord = SAMUtils.findVirtualOffsetOfFirstRecordInBam(seekablePathStream);
final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(seekablePathStream);
final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(seekablePathStream); // Louis: Have BlockCompressedInputStream take a path?
blockIn.seek(vOffsetOfFirstRecord); // This is why we give the BlockCompressedInputStream a SeekablePathStream rather than the regular InputStream
final long remainingInBlock = blockIn.available();
final long remainingInBlock = blockIn.available(); // Louis is probably concerned that I'm passing the same seekablePathStream object

// If we found the end of the header then write the remainder of this block out as a
// new gzip block and then break out of the while loop
Expand All @@ -122,21 +124,28 @@ public static void blockCopyBamFile(final Path inputFile, final OutputStream out
// Don't close blockOut because closing underlying stream would break everything
}

long pos = BlockCompressedFilePointerUtil.getBlockAddress(blockIn.getFilePointer());
long pos = BlockCompressedFilePointerUtil.getBlockAddress(blockIn.getFilePointer()); // tsato: what exactly is a filepointer...
blockIn.close();

in2.seek(pos); // per Louis. But CountingInputStream cannot seek.
while (pos > 0) {
pos -= in.skip(pos);
}
}

// Copy remainder of input stream into output stream
final long currentPos = in.getCount(); // tsato: assuming currentPos is the position after writing the first block, this should be ok.
// final long length = inputFile.length();
final long length = Files.size(inputFile); // tsato: rename to size
final long currentPos2 = in2.position();
if (currentPos2 != currentPos){
int d = 3;
}
final long length = Files.size(inputFile);
final long skipLast = ((term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) && skipTerminator) ?
BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0;
final long bytesToWrite = length - skipLast - currentPos;

IOUtil.transferByStream(in, outputStream, bytesToWrite); // tsato: this method is manually buffered so make sure BufferedReader/Writer is not used (or does that matter?)
// tsato: test in2
IOUtil.transferByStream(in2, outputStream, bytesToWrite);
} catch (final IOException ioe) {
throw new RuntimeIOException(ioe);
}
Expand All @@ -162,7 +171,7 @@ public static void gatherWithBlockCopying(final List<File> bams, final File outp

for (final File f : bams) {
LOG.info(String.format("Block copying %s ...", f.getAbsolutePath()));
blockCopyBamFile(f.toPath(), out, !isFirstFile, true);
blockCopyBamFile(IOUtil.toPath(f), out, !isFirstFile, true);
isFirstFile = false;
}

Expand All @@ -183,27 +192,25 @@ public static void gatherWithBlockCopying(final List<File> bams, final File outp
}
}

@Deprecated
private static OutputStream buildOutputStream(final File outputFile, final boolean createMd5, final boolean createIndex) throws IOException {
return buildOutputStream(outputFile.toPath(), createMd5, createIndex);
return buildOutputStream(IOUtil.toPath(outputFile), createMd5, createIndex);
}

private static OutputStream buildOutputStream(final Path outputFile, final boolean createMd5, final boolean createIndex) throws IOException {
OutputStream outputStream = Files.newOutputStream(outputFile);
if (createMd5) {
outputStream = new Md5CalculatingOutputStream(outputStream, Paths.get(outputFile + ".md")); // tsato: is this right? maybe need .getURI()
outputStream = new Md5CalculatingOutputStream(outputStream, IOUtil.addExtension(outputFile, FileExtensions.MD5));
}
if (createIndex) {
final String baseName = FileNameUtils.getBaseName(outputFile);
outputStream = new StreamInflatingIndexingOutputStream(outputStream, Paths.get(baseName + FileExtensions.BAI_INDEX));
outputStream = new StreamInflatingIndexingOutputStream(outputStream, outputFile.resolveSibling(outputFile.getFileName() + FileExtensions.BAI_INDEX));
}
return outputStream;
}


@Deprecated
private static void assertSortOrdersAreEqual(final SAMFileHeader newHeader, final File inputFile) throws IOException {
assertSortOrdersAreEqual(newHeader, inputFile.toPath());
assertSortOrdersAreEqual(newHeader, IOUtil.toPath(inputFile));
}

private static void assertSortOrdersAreEqual(final SAMFileHeader newHeader, final Path inputFile) throws IOException {
Expand Down
1 change: 1 addition & 0 deletions src/main/java/htsjdk/samtools/util/FileExtensions.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ public final class FileExtensions {
public static final String GZI = ".gzi";
public static final String SBI = ".sbi";
public static final String CSI = ".csi";
public static final String MD5 = ".md5";

public static final Set<String> BLOCK_COMPRESSED = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(".gz", ".gzip", ".bgz", ".bgzf")));

Expand Down
18 changes: 17 additions & 1 deletion src/main/java/htsjdk/samtools/util/IOUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,21 @@ public static void assertFilesAreWritable(final List<File> files) {
for (final File file : files) assertFileIsWritable(file);
}


/**
* In some filesystems (e.g. google cloud) it may not make sense to check writability.
* This method only checks writability when it makes sense to (i.e. for now when the path points to a file
* in the local filesystem)
*
* @param path
*/
// tsato: naming this a bit differently to make clear that it works a bit differently from "assertFileIsWritable"
public static void assertFileIsWritable(final Path path){ // tsato: perhaps IOPath
if (path.toUri().getScheme().equals("file")){
IOUtil.assertFileIsWritable(path.toFile());
}
}

/**
* Checks that a directory is non-null, extent, writable and a directory
* otherwise a runtime exception is thrown.
Expand Down Expand Up @@ -867,7 +882,8 @@ public static OutputStream openFileForMd5CalculatingWriting(final File file) {
}

public static OutputStream openFileForMd5CalculatingWriting(final Path file) {
return new Md5CalculatingOutputStream(IOUtil.openFileForWriting(file), file.resolve(".md5"));
final Path digestFile = file.resolveSibling(file.getFileName() + FileExtensions.MD5);
return new Md5CalculatingOutputStream(IOUtil.openFileForWriting(file), digestFile);
}

/**
Expand Down
95 changes: 62 additions & 33 deletions src/test/java/htsjdk/samtools/BamFileIoUtilsUnitTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import htsjdk.HtsjdkTest;
import htsjdk.beta.exception.HtsjdkException;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.FileExtensions;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
Expand All @@ -22,47 +23,73 @@ public class BamFileIoUtilsUnitTest extends HtsjdkTest {
final static String NA12878_20_21 = TEST_DATA_DIR + "NA12878.20.21.unmapped.orig.bam";
final static int DEFAULT_NUM_RECORDS_TO_COMPARE = 10;

@Test
public void testReheaderBamFile(){
final File originalBam = new File(this.NA12878_8000);
SAMFileHeader header = SamReaderFactory.make().getFileHeader(new File(NA12878_20_21));
try {
final File output = File.createTempFile("output", ".bam");
BamFileIoUtils.reheaderBamFile(header, originalBam.toPath(), output.toPath());

// Confirm that the header has been replaced
final SamReader outputReader = SamReaderFactory.make().open(output.toPath());
Assert.assertEquals(outputReader.getFileHeader(), header);
@DataProvider(name="ReheaderBamFileTestInput")
public Object[][] getReheaderBamFileTestInput() { // tsato: is this the right naming scheme? e.g. get(method-name)Input
// tsato: extract this as a method that takes a number of arguments and returns 2^n elements
return new Object[][] {
{true, true},
{true, false},
{false, true},
{false, false}
};
}

// Confirm that the reads are the same as the original
Assert.assertTrue(compareBamReads(originalBam, output, DEFAULT_NUM_RECORDS_TO_COMPARE));
} catch (IOException e){
throw new HtsjdkException("Could not create a temporary output file.", e);
@Test(dataProvider = "ReheaderBamFileTestInput")
public void testReheaderBamFile(final boolean createMd5, final boolean createIndex) throws IOException {
final File originalBam = new File(NA12878_8000);
SAMFileHeader header = SamReaderFactory.make().getFileHeader(new File(NA12878_8000));
header.addComment("This is a new, modified header");

final Path output = Files.createTempFile("output", ".bam");
BamFileIoUtils.reheaderBamFile(header, originalBam.toPath(), output, createMd5, createIndex);

// Confirm that the header has been replaced
final SamReader outputReader = SamReaderFactory.make().open(output);
Assert.assertEquals(outputReader.getFileHeader(), header);

// Confirm that the reads are the same as the original
Assert.assertTrue(compareBamReads(originalBam, output.toFile(), DEFAULT_NUM_RECORDS_TO_COMPARE)); // tsato: toFile a stop gap

if (createMd5){
Assert.assertTrue(Files.exists(output.resolveSibling(output.getFileName() + FileExtensions.MD5)));
}

if (createIndex){
Assert.assertTrue(SamReaderFactory.make().open(output).hasIndex());
}
}

/**
* Compare first (numRecordsToCompare) reads of two bam files.
* In particular we do not check for equality of the headers.
* We do not check for equality of the headers.
*
* @param numRecordsToCompare the number of reads to compare
*
* @return true if the first (numRecordsToCompare) reads are equal, else false
*/
private boolean compareBamReads(final File bam1, final File bam2, final int numRecordsToCompare){
SamReader reader = SamReaderFactory.make().open(bam1.toPath());
final Iterator<SAMRecord> originalBamIterator = SamReaderFactory.make().open(bam1.toPath()).iterator();
final Iterator<SAMRecord> outputBamIterator = SamReaderFactory.make().open(bam2.toPath()).iterator();

// tsato: per Louis...somehow failing the tests
boolean louis = false;
if (louis){
Assert.assertEquals(originalBamIterator, outputBamIterator);
}
for (int i = 0; i < numRecordsToCompare; i++){
final SAMRecord originalRead = originalBamIterator.next();
final SAMRecord outputRead = outputBamIterator.next();
if (! originalRead.equals(outputRead)){
return false;
}
}


return true;
}

// tsato: kinda ugly?
@DataProvider(name="BlockCopyBamFileTestInput")
public Object[][] getBlockCopyBamFileTestInput() {
return new Object[][] {
Expand All @@ -77,27 +104,29 @@ public Object[][] getBlockCopyBamFileTestInput() {
public void testBlockCopyBamFile(final boolean skipHeader, final boolean skipTerminator) throws IOException {
System.out.println(skipHeader + ", " + skipTerminator);
final File output = File.createTempFile("output", ".bam");
final OutputStream out = Files.newOutputStream(output.toPath());
final Path input = Paths.get(NA12878_8000);
try (final OutputStream out = Files.newOutputStream(output.toPath())) {
final Path input = Paths.get(NA12878_8000);

BamFileIoUtils.blockCopyBamFile(Paths.get(NA12878_8000), out, skipHeader, skipTerminator);
BamFileIoUtils.blockCopyBamFile(Paths.get(NA12878_8000), out, skipHeader, skipTerminator);

final SamReader inputReader = SamReaderFactory.make().open(input);
final SamReader outputReader = SamReaderFactory.make().open(output);
final SamReader inputReader = SamReaderFactory.make().open(input);
final SamReader outputReader = SamReaderFactory.make().open(output);

if (skipHeader){
SAMFileHeader h = outputReader.getFileHeader();
Assert.assertTrue(h.getReadGroups().isEmpty()); // a proxy for the header being empty
} else {
Assert.assertEquals(outputReader.getFileHeader(), inputReader.getFileHeader());
// Reading will fail when the header is absent
Assert.assertTrue(compareBamReads(input.toFile(), output, DEFAULT_NUM_RECORDS_TO_COMPARE));
}
if (skipHeader) {
SAMFileHeader h = outputReader.getFileHeader();
Assert.assertTrue(h.getReadGroups().isEmpty()); // a proxy for the header being empty
} else {
Assert.assertEquals(outputReader.getFileHeader(), inputReader.getFileHeader());
// Reading will fail when the header is absent
Assert.assertTrue(compareBamReads(input.toFile(), output, DEFAULT_NUM_RECORDS_TO_COMPARE));
}

if (skipTerminator){
BlockCompressedInputStream.FileTermination termination = BlockCompressedInputStream.checkTermination(output);
Assert.assertEquals(termination, BlockCompressedInputStream.FileTermination.HAS_HEALTHY_LAST_BLOCK);
if (skipTerminator) {
BlockCompressedInputStream.FileTermination termination = BlockCompressedInputStream.checkTermination(output);
Assert.assertEquals(termination, BlockCompressedInputStream.FileTermination.HAS_HEALTHY_LAST_BLOCK);
}
} catch (IOException e){
throw new HtsjdkException("Caught an IO exception block copying a bam file to " + output.getAbsolutePath(), e);
}

}
}
10 changes: 10 additions & 0 deletions src/test/java/htsjdk/samtools/HtsjdkTestUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package htsjdk.samtools;

import htsjdk.io.HtsPath;

public class HtsjdkTestUtils {
// tsato: change to HtsPath...
public static final String TEST_DATA_DIR = "src/test/resources/htsjdk/samtools/cram/";
public static final HtsPath NA12878_8000 = new HtsPath(TEST_DATA_DIR + "CEUTrio.HiSeq.WGS.b37.NA12878.20.first.8000.bam");
public static final String NA12878_20_21 = TEST_DATA_DIR + "NA12878.20.21.unmapped.orig.bam";
}
Loading

0 comments on commit 4299d3d

Please sign in to comment.