Update some methods to accept Path input #1681

Merged: 14 commits, Oct 11, 2023
101 changes: 64 additions & 37 deletions src/main/java/htsjdk/samtools/BamFileIoUtils.java
@@ -1,5 +1,7 @@
package htsjdk.samtools;

import htsjdk.beta.exception.HtsjdkException;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedOutputStream;
@@ -10,13 +12,15 @@
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.Md5CalculatingOutputStream;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.utils.ValidationUtils;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

public class BamFileIoUtils {
@@ -32,10 +36,18 @@ public static boolean isBamFile(final File file) {
return ((file != null) && SamReader.Type.BAM_TYPE.hasValidFileExtension(file.getName()));
}

public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File inputFile, final File outputFile) {
public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path inputFile, final Path outputFile) {
reheaderBamFile(samFileHeader, inputFile, outputFile, true, true);
}


/**
* Overload that accepts File inputs for backward compatibility; delegates to the Path-based method below.
*/
public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File inputFile, final File outputFile, final boolean createMd5, final boolean createIndex) {
reheaderBamFile(samFileHeader, IOUtil.toPath(inputFile), IOUtil.toPath(outputFile), createMd5, createIndex);
}

/**
* Copy a BAM file but replacing the header
*
@@ -45,12 +57,14 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File
* @param createMd5 Whether or not to create an MD5 file for the new BAM
* @param createIndex Whether or not to create an index file for the new BAM
*/
public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File inputFile, final File outputFile, final boolean createMd5, final boolean createIndex) {
public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path inputFile, final Path outputFile, final boolean createMd5, final boolean createIndex) {
ValidationUtils.nonNull(inputFile);
ValidationUtils.nonNull(outputFile);
IOUtil.assertFileIsReadable(inputFile);
IOUtil.assertFileIsWritable(outputFile);
IOUtil.assertFileIsWritable(inputFile);

try {
BlockCompressedInputStream.assertNonDefectiveFile(inputFile);
BlockCompressedInputStream.assertNonDefectivePath(inputFile);
assertSortOrdersAreEqual(samFileHeader, inputFile);

final OutputStream outputStream = buildOutputStream(outputFile, createMd5, createIndex);
@@ -65,58 +79,61 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File
}
}
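
A minimal usage sketch of the new Path-based reheaderBamFile (file names and the header tweak are hypothetical; the new header must keep the input's sort order, and createIndex assumes a coordinate-sorted BAM):

import htsjdk.samtools.BamFileIoUtils;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;

import java.nio.file.Path;
import java.nio.file.Paths;

public class ReheaderSketch {
    public static void main(final String[] args) throws Exception {
        final Path input = Paths.get("input.bam");
        final Path output = Paths.get("reheadered.bam");

        // Start from the existing header so the sort order stays unchanged.
        final SAMFileHeader newHeader;
        try (final SamReader reader = SamReaderFactory.makeDefault().open(input)) {
            newHeader = reader.getFileHeader().clone();
        }
        newHeader.addComment("Reheadered via BamFileIoUtils");

        // Block-copies the records and writes reheadered.bam plus its .md5 and .bai files.
        BamFileIoUtils.reheaderBamFile(newHeader, input, output, true, true);
    }
}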

public static void blockCopyBamFile(final File inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) {
blockCopyBamFile(IOUtil.toPath(inputFile), outputStream, skipHeader, skipTerminator);
}

/**
* Copy data from a BAM file to an OutputStream by directly copying the gzip blocks
* Copy data from a BAM file to an OutputStream by directly copying the gzip blocks.
*
* @param inputFile The file to be copied
* @param inputFile The BAM file to be copied
* @param outputStream The stream to write the copied data to
* @param skipHeader If true, the header of the input file will not be copied to the output stream
* @param skipTerminator If true, the terminator block of the input file will not be written to the output stream
*/
public static void blockCopyBamFile(final File inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) {
FileInputStream in = null;
try {
in = new FileInputStream(inputFile);

public static void blockCopyBamFile(final Path inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) {
try (final SeekablePathStream in = new SeekablePathStream(inputFile)){
// a) It's good to check that the end of the file is valid and b) we need to know if there's a terminator block and not copy it if skipTerminator is true
final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(inputFile);
if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE)
throw new SAMException(inputFile.getAbsolutePath() + " does not have a valid GZIP block at the end of the file.");
throw new SAMException(inputFile.toUri() + " does not have a valid GZIP block at the end of the file.");

if (skipHeader) {
final long vOffsetOfFirstRecord = SAMUtils.findVirtualOffsetOfFirstRecordInBam(inputFile);
final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(inputFile);
blockIn.seek(vOffsetOfFirstRecord);
final long remainingInBlock = blockIn.available();

// If we found the end of the header then write the remainder of this block out as a
// new gzip block and then break out of the while loop
if (remainingInBlock >= 0) {
final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(outputStream, (Path)null);
IOUtil.transferByStream(blockIn, blockOut, remainingInBlock);
blockOut.flush();
// Don't close blockOut because closing underlying stream would break everything
}

long pos = BlockCompressedFilePointerUtil.getBlockAddress(blockIn.getFilePointer());
blockIn.close();
while (pos > 0) {
pos -= in.skip(pos);
// tsato: curious --- why do we need BlockCompressedInputStream at all here?
try (final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(inputFile)) {
blockIn.seek(vOffsetOfFirstRecord);
final long remainingInBlock = blockIn.available();

// If we found the end of the header then write the remainder of this block out as a
// new gzip block and then break out of the while loop (tsato: update this comment)
if (remainingInBlock >= 0) {
final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(outputStream, (Path) null);
IOUtil.transferByStream(blockIn, blockOut, remainingInBlock);
blockOut.flush();
// Don't close blockOut because closing underlying stream would break everything
}

final long pos = BlockCompressedFilePointerUtil.getBlockAddress(blockIn.getFilePointer());
blockIn.close(); // tsato: why doesn't IntelliJ say this is unnecessary?
Review comment (Member): You could leave out that close. I don't know why IntelliJ isn't telling you that, but it looks like it's redundant to me.
in.seek(pos);
} catch (IOException e){
throw new HtsjdkException("Encountered an error.", e);
}
Review comment (Member): let's replace this while() loop with a single in.seek(pos)
Reply (Collaborator Author): done
}

// Copy remainder of input stream into output stream
final long currentPos = in.getChannel().position();
final long length = inputFile.length();
final long currentPos = in.position();
final long length = Files.size(inputFile);
final long skipLast = ((term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) && skipTerminator) ?
BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0;
Review comment (Member): I think length is still perfectly clear.
final long bytesToWrite = length - skipLast - currentPos;

IOUtil.transferByStream(in, outputStream, bytesToWrite);
} catch (final IOException ioe) {
throw new RuntimeIOException(ioe);
} finally {
CloserUtil.close(in);
}
}
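
A sketch of how the Path overload composes (the same pattern gatherWithBlockCopying uses below); the two shards are hypothetical coordinate-sorted BAMs that share a header:

import htsjdk.samtools.BamFileIoUtils;

import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

public class BlockCopySketch {
    public static void main(final String[] args) throws Exception {
        try (final OutputStream out = Files.newOutputStream(Paths.get("combined.bam"))) {
            // First shard: keep its header, drop its terminator block.
            BamFileIoUtils.blockCopyBamFile(Paths.get("shard1.bam"), out, false, true);
            // Last shard: drop its header, keep its terminator block.
            BamFileIoUtils.blockCopyBamFile(Paths.get("shard2.bam"), out, true, false);
        }
    }
}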

@@ -140,7 +157,7 @@ public static void gatherWithBlockCopying(final List<File> bams, final File outp

for (final File f : bams) {
LOG.info(String.format("Block copying %s ...", f.getAbsolutePath()));
blockCopyBamFile(f, out, !isFirstFile, true);
blockCopyBamFile(IOUtil.toPath(f), out, !isFirstFile, true);
isFirstFile = false;
}

@@ -162,17 +179,27 @@ public static void gatherWithBlockCopying(final List<File> bams, final File outp
}

private static OutputStream buildOutputStream(final File outputFile, final boolean createMd5, final boolean createIndex) throws IOException {
OutputStream outputStream = new FileOutputStream(outputFile);
return buildOutputStream(IOUtil.toPath(outputFile), createMd5, createIndex);
}

private static OutputStream buildOutputStream(final Path outputFile, final boolean createMd5, final boolean createIndex) throws IOException {
OutputStream outputStream = Files.newOutputStream(outputFile);
if (createMd5) {
outputStream = new Md5CalculatingOutputStream(outputStream, new File(outputFile.getAbsolutePath() + ".md5"));
outputStream = new Md5CalculatingOutputStream(outputStream, IOUtil.addExtension(outputFile, FileExtensions.MD5));
}
if (createIndex) {
outputStream = new StreamInflatingIndexingOutputStream(outputStream, new File(outputFile.getParentFile(), IOUtil.basename(outputFile) + FileExtensions.BAI_INDEX));
outputStream = new StreamInflatingIndexingOutputStream(outputStream, outputFile.resolveSibling(outputFile.getFileName() + FileExtensions.BAI_INDEX));
}
return outputStream;
}


@Deprecated
private static void assertSortOrdersAreEqual(final SAMFileHeader newHeader, final File inputFile) throws IOException {
assertSortOrdersAreEqual(newHeader, IOUtil.toPath(inputFile));
}

private static void assertSortOrdersAreEqual(final SAMFileHeader newHeader, final Path inputFile) throws IOException {
final SamReader reader = SamReaderFactory.makeDefault().open(inputFile);
final SAMFileHeader origHeader = reader.getFileHeader();
final SAMFileHeader.SortOrder newSortOrder = newHeader.getSortOrder();
2 changes: 1 addition & 1 deletion src/main/java/htsjdk/samtools/SAMRecordIterator.java
@@ -28,7 +28,7 @@
/**
* A general interface that adds functionality to a CloseableIterator of
* SAMRecords. Currently, this interface is implemented by iterators that
* want to validate as they are iterating that that the records in the
* want to validate as they are iterating that the records in the
* underlying SAM/BAM file are in a particular order.
*/
public interface SAMRecordIterator extends CloseableIterator<SAMRecord> {
11 changes: 11 additions & 0 deletions src/main/java/htsjdk/samtools/SAMUtils.java
@@ -23,6 +23,7 @@
*/
package htsjdk.samtools;

import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.CigarUtil;
@@ -36,6 +37,7 @@
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.nio.file.Path;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
@@ -677,6 +679,15 @@ public static int combineMapqs(int m1, int m2) {

}


public static long findVirtualOffsetOfFirstRecordInBam(final Path bamFile) {
try (SeekableStream ss = new SeekablePathStream(bamFile)){
return BAMFileReader.findVirtualOffsetOfFirstRecord(ss);
} catch (final IOException ioe) {
throw new RuntimeEOFException(ioe);
}
}
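
A small sketch pairing the new Path overload with the Path constructor added to BlockCompressedInputStream in this PR (the BAM name is hypothetical):

import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.util.BlockCompressedInputStream;

import java.nio.file.Path;
import java.nio.file.Paths;

public class FirstRecordOffsetSketch {
    public static void main(final String[] args) throws Exception {
        final Path bam = Paths.get("reads.bam");
        final long virtualOffset = SAMUtils.findVirtualOffsetOfFirstRecordInBam(bam);
        try (final BlockCompressedInputStream in = new BlockCompressedInputStream(bam)) {
            in.seek(virtualOffset); // position the stream just past the SAM header
            System.out.println("Bytes available in the first record's block: " + in.available());
        }
    }
}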

/**
* Returns the virtual file offset of the first record in a BAM file - i.e. the virtual file
* offset after skipping over the text header and the sequence records.
12 changes: 8 additions & 4 deletions src/main/java/htsjdk/samtools/SamFileValidator.java
@@ -52,6 +52,7 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.nio.file.Path;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collection;
@@ -209,20 +210,23 @@ public boolean validateSamFileVerbose(final SamReader samReader, final Reference
}

public void validateBamFileTermination(final File inputFile) {
validateBamFileTermination(IOUtil.toPath(inputFile));
}

public void validateBamFileTermination(final Path inputFile) {
try {
if (!IOUtil.isBlockCompressed(inputFile.toPath())) {
if (!IOUtil.isBlockCompressed(inputFile)) {
return;
}
final BlockCompressedInputStream.FileTermination terminationState =
BlockCompressedInputStream.checkTermination(inputFile);
if (terminationState.equals(BlockCompressedInputStream.FileTermination.DEFECTIVE)) {
addError(new SAMValidationError(Type.TRUNCATED_FILE, "BAM file has defective last gzip block",
inputFile.getPath()));
inputFile.toUri().toString()));
} else if (terminationState.equals(BlockCompressedInputStream.FileTermination.HAS_HEALTHY_LAST_BLOCK)) {
addError(new SAMValidationError(Type.BAM_FILE_MISSING_TERMINATOR_BLOCK,
"Older BAM file -- does not have terminator block",
inputFile.getPath()));

inputFile.toUri().toString()));
}
} catch (IOException e) {
throw new SAMException("IOException", e);
src/main/java/htsjdk/samtools/util/BlockCompressedInputStream.java
@@ -29,6 +29,7 @@
import htsjdk.samtools.seekablestream.SeekableBufferedStream;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableHTTPStream;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.zip.InflaterFactory;

@@ -123,6 +124,15 @@ public BlockCompressedInputStream(final File file) throws IOException {
this(file, BlockGunzipper.getDefaultInflaterFactory());
}


/**
* Path-based equivalent of the constructor that takes a File. Supports seeking.
*/
public BlockCompressedInputStream(final Path file) throws IOException {
this(new SeekablePathStream(file));
}
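
A minimal sketch using the new constructor to stream any BGZF-compressed text file (the file name is hypothetical); because it is backed by SeekablePathStream, non-local NIO Paths should work as well:

import htsjdk.samtools.util.BlockCompressedInputStream;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;

public class BgzfReadSketch {
    public static void main(final String[] args) throws Exception {
        try (final BufferedReader reader = new BufferedReader(new InputStreamReader(
                new BlockCompressedInputStream(Paths.get("variants.vcf.gz")), StandardCharsets.UTF_8))) {
            System.out.println(reader.readLine()); // first line of the decompressed content
        }
    }
}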


/**
* Use this ctor if you wish to call seek()
* @param file source of bytes
@@ -135,6 +145,7 @@ public BlockCompressedInputStream(final File file, final InflaterFactory inflate
blockGunzipper = new BlockGunzipper(inflaterFactory);
}


/**
* @param url source of bytes
*/
@@ -704,9 +715,14 @@ static void readFully(SeekableByteChannel channel, ByteBuffer dst) throws IOExce
}
}

@Deprecated
public static void assertNonDefectiveFile(final File file) throws IOException {
assertNonDefectivePath(IOUtil.toPath(file));
}

public static void assertNonDefectivePath(final Path file) throws IOException {
if (checkTermination(file) == FileTermination.DEFECTIVE) {
throw new SAMException(file.getAbsolutePath() + " does not have a valid GZIP block at the end of the file.");
throw new SAMException(file.toUri() + " does not have a valid GZIP block at the end of the file.");
}
}

1 change: 1 addition & 0 deletions src/main/java/htsjdk/samtools/util/FileExtensions.java
@@ -75,6 +75,7 @@ public final class FileExtensions {
public static final String GZI = ".gzi";
public static final String SBI = ".sbi";
public static final String CSI = ".csi";
public static final String MD5 = ".md5";

public static final Set<String> BLOCK_COMPRESSED = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(".gz", ".gzip", ".bgz", ".bgzf")));

15 changes: 14 additions & 1 deletion src/main/java/htsjdk/samtools/util/IOUtil.java
@@ -582,6 +582,18 @@ public static void assertFilesAreWritable(final List<File> files) {
for (final File file : files) assertFileIsWritable(file);
}


/**
* In some filesystems (e.g. Google Cloud Storage) it may not make sense to check writability.
* This method only checks writability when it is meaningful to do so (for now, when the path
* points to a file on the local filesystem).
*/
public static void assertFileIsWritable(final Path path){ // tsato: perhaps the input type should be IOPath
if (path.toUri().getScheme().equals("file")){
IOUtil.assertFileIsWritable(path.toFile());
}
}
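
A sketch of the intended behavior with hypothetical paths; checking a non-local path requires the matching NIO filesystem provider on the classpath:

import htsjdk.samtools.util.IOUtil;

import java.nio.file.Path;
import java.nio.file.Paths;

public class WritabilityCheckSketch {
    public static void main(final String[] args) {
        final Path localOut = Paths.get("/tmp/output.bam");
        IOUtil.assertFileIsWritable(localOut); // URI scheme is "file", so the File-based check runs

        // For a Path whose URI scheme is not "file" (e.g. one created through a cloud NIO
        // provider), the method returns without attempting a writability check.
    }
}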

/**
* Checks that a directory is non-null, extent, writable and a directory
* otherwise a runtime exception is thrown.
@@ -867,7 +879,8 @@ public static OutputStream openFileForMd5CalculatingWriting(final File file) {
}

public static OutputStream openFileForMd5CalculatingWriting(final Path file) {
return new Md5CalculatingOutputStream(IOUtil.openFileForWriting(file), file.resolve(".md5"));
final Path digestFile = file.resolveSibling(file.getFileName() + FileExtensions.MD5);
return new Md5CalculatingOutputStream(IOUtil.openFileForWriting(file), digestFile);
}
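
With the fix above the digest is written as a sibling file named "<output>.md5" instead of being resolved as a child path; a minimal sketch with a hypothetical output name:

import htsjdk.samtools.util.IOUtil;

import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;

public class Md5WritingSketch {
    public static void main(final String[] args) throws Exception {
        try (final OutputStream out = IOUtil.openFileForMd5CalculatingWriting(Paths.get("metrics.txt"))) {
            out.write("hello\n".getBytes(StandardCharsets.UTF_8));
        } // closing the stream writes metrics.txt.md5 next to metrics.txt
    }
}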
