From 4299d3d481f92c648a1df27ee626c58e224df02b Mon Sep 17 00:00:00 2001 From: Takuto Sato Date: Tue, 29 Aug 2023 11:50:30 -0400 Subject: [PATCH] testing around --- .../java/htsjdk/samtools/BAMFileReader.java | 4 +- .../java/htsjdk/samtools/BamFileIoUtils.java | 57 ++++++----- .../htsjdk/samtools/util/FileExtensions.java | 1 + .../java/htsjdk/samtools/util/IOUtil.java | 18 +++- .../samtools/BamFileIoUtilsUnitTest.java | 95 ++++++++++++------- .../java/htsjdk/samtools/HtsjdkTestUtils.java | 10 ++ .../java/htsjdk/samtools/util/IOUtilTest.java | 23 +++++ 7 files changed, 147 insertions(+), 61 deletions(-) create mode 100644 src/test/java/htsjdk/samtools/HtsjdkTestUtils.java diff --git a/src/main/java/htsjdk/samtools/BAMFileReader.java b/src/main/java/htsjdk/samtools/BAMFileReader.java index 3693eeba8c..0addba3773 100644 --- a/src/main/java/htsjdk/samtools/BAMFileReader.java +++ b/src/main/java/htsjdk/samtools/BAMFileReader.java @@ -297,8 +297,8 @@ private BAMFileReader(final BlockCompressedInputStream compressedInputStream, this.eagerDecode = eagerDecode; this.mValidationStringency = validationStringency; this.samRecordFactory = samRecordFactory; - this.mFileHeader = readHeader(this.mStream, this.mValidationStringency, source); - mFirstRecordPointer = mCompressedInputStream.getFilePointer(); + this.mFileHeader = readHeader(this.mStream, this.mValidationStringency, source); // tsato: move the "pointer" to the position of first read + mFirstRecordPointer = mCompressedInputStream.getFilePointer(); // tsato: where do we "seek"? Why did we get away without seekableStream before? } /** diff --git a/src/main/java/htsjdk/samtools/BamFileIoUtils.java b/src/main/java/htsjdk/samtools/BamFileIoUtils.java index 1bb85adc57..9db977b06e 100644 --- a/src/main/java/htsjdk/samtools/BamFileIoUtils.java +++ b/src/main/java/htsjdk/samtools/BamFileIoUtils.java @@ -12,13 +12,12 @@ import htsjdk.samtools.util.Log; import htsjdk.samtools.util.Md5CalculatingOutputStream; import htsjdk.samtools.util.RuntimeIOException; +import htsjdk.utils.ValidationUtils; import org.apache.commons.compress.utils.FileNameUtils; import java.io.File; -import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; @@ -46,9 +45,8 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path /** * Support File input types for backward compatibility. Use the same method with Path inputs below. */ - @Deprecated public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File inputFile, final File outputFile, final boolean createMd5, final boolean createIndex) { - reheaderBamFile(samFileHeader, inputFile.toPath(), outputFile.toPath(), createMd5, createIndex); + reheaderBamFile(samFileHeader, IOUtil.toPath(inputFile), IOUtil.toPath(outputFile), createMd5, createIndex); } /** @@ -61,8 +59,10 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final File * @param createIndex Whether or not to create an index file for the new BAM */ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path inputFile, final Path outputFile, final boolean createMd5, final boolean createIndex) { + ValidationUtils.nonNull(inputFile); + ValidationUtils.nonNull(outputFile); IOUtil.assertFileIsReadable(inputFile); - // IOUtil.assertFileIsWritable(outputFile); // tsato: what do I do with this... + IOUtil.assertFileIsWritable(inputFile); try { BlockCompressedInputStream.assertNonDefectivePath(inputFile); @@ -80,12 +80,8 @@ public static void reheaderBamFile(final SAMFileHeader samFileHeader, final Path } } - /** - * @deprecated as of August 2023. Use the method by the same name below with Path input - */ - @Deprecated public static void blockCopyBamFile(final File inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) { - blockCopyBamFile(inputFile.toPath(), outputStream, skipHeader, skipTerminator); + blockCopyBamFile(IOUtil.toPath(inputFile), outputStream, skipHeader, skipTerminator); } /** @@ -99,7 +95,8 @@ public static void blockCopyBamFile(final File inputFile, final OutputStream out public static void blockCopyBamFile(final Path inputFile, final OutputStream outputStream, final boolean skipHeader, final boolean skipTerminator) { // tsato: use CountingInputStream to replace FileInputStream. The latter has .getChannel().getPosition // The regular InputStream from Files.newInputStream() doesn't have it, but I might be able to do so by creating Channel object and avoid using CountingInputStream - try (final CountingInputStream in = new CountingInputStream(Files.newInputStream(inputFile))){ + try (final CountingInputStream in = new CountingInputStream(Files.newInputStream(inputFile)); + final SeekablePathStream in2 = new SeekablePathStream(inputFile)){ // try (final InputStream in = Files.newInputStream(inputFile)){ // a) It's good to check that the end of the file is valid and b) we need to know if there's a terminator block and not copy it if skipTerminator is true final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(inputFile); @@ -107,11 +104,16 @@ public static void blockCopyBamFile(final Path inputFile, final OutputStream out throw new SAMException(inputFile.toUri() + " does not have a valid GZIP block at the end of the file."); if (skipHeader) { // tsato: this method assumes bam file...should I test cram? + // tsato: reason virtual offset of first read when the input is file doesn't need seekable stream is the blockcompressed one + // calls seekable. Then + final long vOffsetOfFirstRecord0 = SAMUtils.findVirtualOffsetOfFirstRecordInBam(inputFile.toFile()); // tsato: just for testing purposes + final long vOffsetOfFirstRecord = SAMUtils.findVirtualOffsetOfFirstRecordInBam(inputFile); + int d = 3; + final SeekablePathStream seekablePathStream = new SeekablePathStream(inputFile); - final long vOffsetOfFirstRecord = SAMUtils.findVirtualOffsetOfFirstRecordInBam(seekablePathStream); - final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(seekablePathStream); + final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(seekablePathStream); // Louis: Have BlockCompressedInputStream take a path? blockIn.seek(vOffsetOfFirstRecord); // This is why we give the BlockCompressedInputStream a SeekablePathStream rather than the regular InputStream - final long remainingInBlock = blockIn.available(); + final long remainingInBlock = blockIn.available(); // Louis is probably concerned that I'm passing the same seekablePathStream object // If we found the end of the header then write the remainder of this block out as a // new gzip block and then break out of the while loop @@ -122,21 +124,28 @@ public static void blockCopyBamFile(final Path inputFile, final OutputStream out // Don't close blockOut because closing underlying stream would break everything } - long pos = BlockCompressedFilePointerUtil.getBlockAddress(blockIn.getFilePointer()); + long pos = BlockCompressedFilePointerUtil.getBlockAddress(blockIn.getFilePointer()); // tsato: what exactly is a filepointer... blockIn.close(); + + in2.seek(pos); // per Louis. But CountingInputStream cannot seek. while (pos > 0) { pos -= in.skip(pos); } } + // Copy remainder of input stream into output stream final long currentPos = in.getCount(); // tsato: assuming currentPos is the position after writing the first block, this should be ok. - // final long length = inputFile.length(); - final long length = Files.size(inputFile); // tsato: rename to size + final long currentPos2 = in2.position(); + if (currentPos2 != currentPos){ + int d = 3; + } + final long length = Files.size(inputFile); final long skipLast = ((term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) && skipTerminator) ? BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0; final long bytesToWrite = length - skipLast - currentPos; - IOUtil.transferByStream(in, outputStream, bytesToWrite); // tsato: this method is manually buffered so make sure BufferedReader/Writer is not used (or does that matter?) + // tsato: test in2 + IOUtil.transferByStream(in2, outputStream, bytesToWrite); } catch (final IOException ioe) { throw new RuntimeIOException(ioe); } @@ -162,7 +171,7 @@ public static void gatherWithBlockCopying(final List bams, final File outp for (final File f : bams) { LOG.info(String.format("Block copying %s ...", f.getAbsolutePath())); - blockCopyBamFile(f.toPath(), out, !isFirstFile, true); + blockCopyBamFile(IOUtil.toPath(f), out, !isFirstFile, true); isFirstFile = false; } @@ -183,19 +192,17 @@ public static void gatherWithBlockCopying(final List bams, final File outp } } - @Deprecated private static OutputStream buildOutputStream(final File outputFile, final boolean createMd5, final boolean createIndex) throws IOException { - return buildOutputStream(outputFile.toPath(), createMd5, createIndex); + return buildOutputStream(IOUtil.toPath(outputFile), createMd5, createIndex); } private static OutputStream buildOutputStream(final Path outputFile, final boolean createMd5, final boolean createIndex) throws IOException { OutputStream outputStream = Files.newOutputStream(outputFile); if (createMd5) { - outputStream = new Md5CalculatingOutputStream(outputStream, Paths.get(outputFile + ".md")); // tsato: is this right? maybe need .getURI() + outputStream = new Md5CalculatingOutputStream(outputStream, IOUtil.addExtension(outputFile, FileExtensions.MD5)); } if (createIndex) { - final String baseName = FileNameUtils.getBaseName(outputFile); - outputStream = new StreamInflatingIndexingOutputStream(outputStream, Paths.get(baseName + FileExtensions.BAI_INDEX)); + outputStream = new StreamInflatingIndexingOutputStream(outputStream, outputFile.resolveSibling(outputFile.getFileName() + FileExtensions.BAI_INDEX)); } return outputStream; } @@ -203,7 +210,7 @@ private static OutputStream buildOutputStream(final Path outputFile, final boole @Deprecated private static void assertSortOrdersAreEqual(final SAMFileHeader newHeader, final File inputFile) throws IOException { - assertSortOrdersAreEqual(newHeader, inputFile.toPath()); + assertSortOrdersAreEqual(newHeader, IOUtil.toPath(inputFile)); } private static void assertSortOrdersAreEqual(final SAMFileHeader newHeader, final Path inputFile) throws IOException { diff --git a/src/main/java/htsjdk/samtools/util/FileExtensions.java b/src/main/java/htsjdk/samtools/util/FileExtensions.java index fc2e37d6c6..b67154a629 100755 --- a/src/main/java/htsjdk/samtools/util/FileExtensions.java +++ b/src/main/java/htsjdk/samtools/util/FileExtensions.java @@ -75,6 +75,7 @@ public final class FileExtensions { public static final String GZI = ".gzi"; public static final String SBI = ".sbi"; public static final String CSI = ".csi"; + public static final String MD5 = ".md5"; public static final Set BLOCK_COMPRESSED = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(".gz", ".gzip", ".bgz", ".bgzf"))); diff --git a/src/main/java/htsjdk/samtools/util/IOUtil.java b/src/main/java/htsjdk/samtools/util/IOUtil.java index d473bebbe0..0e7b9ba1a1 100644 --- a/src/main/java/htsjdk/samtools/util/IOUtil.java +++ b/src/main/java/htsjdk/samtools/util/IOUtil.java @@ -582,6 +582,21 @@ public static void assertFilesAreWritable(final List files) { for (final File file : files) assertFileIsWritable(file); } + + /** + * In some filesystems (e.g. google cloud) it may not make sense to check writability. + * This method only checks writability when it makes sense to (i.e. for now when the path points to a file + * in the local filesystem) + * + * @param path + */ + // tsato: naming this a bit differently to make clear that it works a bit differently from "assertFileIsWritable" + public static void assertFileIsWritable(final Path path){ // tsato: perhaps IOPath + if (path.toUri().getScheme().equals("file")){ + IOUtil.assertFileIsWritable(path.toFile()); + } + } + /** * Checks that a directory is non-null, extent, writable and a directory * otherwise a runtime exception is thrown. @@ -867,7 +882,8 @@ public static OutputStream openFileForMd5CalculatingWriting(final File file) { } public static OutputStream openFileForMd5CalculatingWriting(final Path file) { - return new Md5CalculatingOutputStream(IOUtil.openFileForWriting(file), file.resolve(".md5")); + final Path digestFile = file.resolveSibling(file.getFileName() + FileExtensions.MD5); + return new Md5CalculatingOutputStream(IOUtil.openFileForWriting(file), digestFile); } /** diff --git a/src/test/java/htsjdk/samtools/BamFileIoUtilsUnitTest.java b/src/test/java/htsjdk/samtools/BamFileIoUtilsUnitTest.java index 006754c75e..645a134954 100644 --- a/src/test/java/htsjdk/samtools/BamFileIoUtilsUnitTest.java +++ b/src/test/java/htsjdk/samtools/BamFileIoUtilsUnitTest.java @@ -3,6 +3,7 @@ import htsjdk.HtsjdkTest; import htsjdk.beta.exception.HtsjdkException; import htsjdk.samtools.util.BlockCompressedInputStream; +import htsjdk.samtools.util.FileExtensions; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -22,36 +23,61 @@ public class BamFileIoUtilsUnitTest extends HtsjdkTest { final static String NA12878_20_21 = TEST_DATA_DIR + "NA12878.20.21.unmapped.orig.bam"; final static int DEFAULT_NUM_RECORDS_TO_COMPARE = 10; - @Test - public void testReheaderBamFile(){ - final File originalBam = new File(this.NA12878_8000); - SAMFileHeader header = SamReaderFactory.make().getFileHeader(new File(NA12878_20_21)); - try { - final File output = File.createTempFile("output", ".bam"); - BamFileIoUtils.reheaderBamFile(header, originalBam.toPath(), output.toPath()); - // Confirm that the header has been replaced - final SamReader outputReader = SamReaderFactory.make().open(output.toPath()); - Assert.assertEquals(outputReader.getFileHeader(), header); + @DataProvider(name="ReheaderBamFileTestInput") + public Object[][] getReheaderBamFileTestInput() { // tsato: is this the right naming scheme? e.g. get(method-name)Input + // tsato: extract this as a method that takes a number of arguments and returns 2^n elements + return new Object[][] { + {true, true}, + {true, false}, + {false, true}, + {false, false} + }; + } - // Confirm that the reads are the same as the original - Assert.assertTrue(compareBamReads(originalBam, output, DEFAULT_NUM_RECORDS_TO_COMPARE)); - } catch (IOException e){ - throw new HtsjdkException("Could not create a temporary output file.", e); + @Test(dataProvider = "ReheaderBamFileTestInput") + public void testReheaderBamFile(final boolean createMd5, final boolean createIndex) throws IOException { + final File originalBam = new File(NA12878_8000); + SAMFileHeader header = SamReaderFactory.make().getFileHeader(new File(NA12878_8000)); + header.addComment("This is a new, modified header"); + + final Path output = Files.createTempFile("output", ".bam"); + BamFileIoUtils.reheaderBamFile(header, originalBam.toPath(), output, createMd5, createIndex); + + // Confirm that the header has been replaced + final SamReader outputReader = SamReaderFactory.make().open(output); + Assert.assertEquals(outputReader.getFileHeader(), header); + + // Confirm that the reads are the same as the original + Assert.assertTrue(compareBamReads(originalBam, output.toFile(), DEFAULT_NUM_RECORDS_TO_COMPARE)); // tsato: toFile a stop gap + + if (createMd5){ + Assert.assertTrue(Files.exists(output.resolveSibling(output.getFileName() + FileExtensions.MD5))); + } + + if (createIndex){ + Assert.assertTrue(SamReaderFactory.make().open(output).hasIndex()); } } /** * Compare first (numRecordsToCompare) reads of two bam files. - * In particular we do not check for equality of the headers. + * We do not check for equality of the headers. * * @param numRecordsToCompare the number of reads to compare * * @return true if the first (numRecordsToCompare) reads are equal, else false */ private boolean compareBamReads(final File bam1, final File bam2, final int numRecordsToCompare){ + SamReader reader = SamReaderFactory.make().open(bam1.toPath()); final Iterator originalBamIterator = SamReaderFactory.make().open(bam1.toPath()).iterator(); final Iterator outputBamIterator = SamReaderFactory.make().open(bam2.toPath()).iterator(); + + // tsato: per Louis...somehow failing the tests + boolean louis = false; + if (louis){ + Assert.assertEquals(originalBamIterator, outputBamIterator); + } for (int i = 0; i < numRecordsToCompare; i++){ final SAMRecord originalRead = originalBamIterator.next(); final SAMRecord outputRead = outputBamIterator.next(); @@ -59,10 +85,11 @@ private boolean compareBamReads(final File bam1, final File bam2, final int numR return false; } } + + return true; } - // tsato: kinda ugly? @DataProvider(name="BlockCopyBamFileTestInput") public Object[][] getBlockCopyBamFileTestInput() { return new Object[][] { @@ -77,27 +104,29 @@ public Object[][] getBlockCopyBamFileTestInput() { public void testBlockCopyBamFile(final boolean skipHeader, final boolean skipTerminator) throws IOException { System.out.println(skipHeader + ", " + skipTerminator); final File output = File.createTempFile("output", ".bam"); - final OutputStream out = Files.newOutputStream(output.toPath()); - final Path input = Paths.get(NA12878_8000); + try (final OutputStream out = Files.newOutputStream(output.toPath())) { + final Path input = Paths.get(NA12878_8000); - BamFileIoUtils.blockCopyBamFile(Paths.get(NA12878_8000), out, skipHeader, skipTerminator); + BamFileIoUtils.blockCopyBamFile(Paths.get(NA12878_8000), out, skipHeader, skipTerminator); - final SamReader inputReader = SamReaderFactory.make().open(input); - final SamReader outputReader = SamReaderFactory.make().open(output); + final SamReader inputReader = SamReaderFactory.make().open(input); + final SamReader outputReader = SamReaderFactory.make().open(output); - if (skipHeader){ - SAMFileHeader h = outputReader.getFileHeader(); - Assert.assertTrue(h.getReadGroups().isEmpty()); // a proxy for the header being empty - } else { - Assert.assertEquals(outputReader.getFileHeader(), inputReader.getFileHeader()); - // Reading will fail when the header is absent - Assert.assertTrue(compareBamReads(input.toFile(), output, DEFAULT_NUM_RECORDS_TO_COMPARE)); - } + if (skipHeader) { + SAMFileHeader h = outputReader.getFileHeader(); + Assert.assertTrue(h.getReadGroups().isEmpty()); // a proxy for the header being empty + } else { + Assert.assertEquals(outputReader.getFileHeader(), inputReader.getFileHeader()); + // Reading will fail when the header is absent + Assert.assertTrue(compareBamReads(input.toFile(), output, DEFAULT_NUM_RECORDS_TO_COMPARE)); + } - if (skipTerminator){ - BlockCompressedInputStream.FileTermination termination = BlockCompressedInputStream.checkTermination(output); - Assert.assertEquals(termination, BlockCompressedInputStream.FileTermination.HAS_HEALTHY_LAST_BLOCK); + if (skipTerminator) { + BlockCompressedInputStream.FileTermination termination = BlockCompressedInputStream.checkTermination(output); + Assert.assertEquals(termination, BlockCompressedInputStream.FileTermination.HAS_HEALTHY_LAST_BLOCK); + } + } catch (IOException e){ + throw new HtsjdkException("Caught an IO exception block copying a bam file to " + output.getAbsolutePath(), e); } - } } \ No newline at end of file diff --git a/src/test/java/htsjdk/samtools/HtsjdkTestUtils.java b/src/test/java/htsjdk/samtools/HtsjdkTestUtils.java new file mode 100644 index 0000000000..7d1c51f83d --- /dev/null +++ b/src/test/java/htsjdk/samtools/HtsjdkTestUtils.java @@ -0,0 +1,10 @@ +package htsjdk.samtools; + +import htsjdk.io.HtsPath; + +public class HtsjdkTestUtils { + // tsato: change to HtsPath... + public static final String TEST_DATA_DIR = "src/test/resources/htsjdk/samtools/cram/"; + public static final HtsPath NA12878_8000 = new HtsPath(TEST_DATA_DIR + "CEUTrio.HiSeq.WGS.b37.NA12878.20.first.8000.bam"); + public static final String NA12878_20_21 = TEST_DATA_DIR + "NA12878.20.21.unmapped.orig.bam"; +} diff --git a/src/test/java/htsjdk/samtools/util/IOUtilTest.java b/src/test/java/htsjdk/samtools/util/IOUtilTest.java index 8354b5ba29..6a64a6b957 100644 --- a/src/test/java/htsjdk/samtools/util/IOUtilTest.java +++ b/src/test/java/htsjdk/samtools/util/IOUtilTest.java @@ -35,7 +35,14 @@ import java.nio.file.Paths; import java.nio.file.spi.FileSystemProvider; +import htsjdk.beta.exception.HtsjdkException; +import htsjdk.samtools.BAMFileWriter; +import htsjdk.samtools.BamFileIoUtils; +import htsjdk.samtools.HtsjdkTestUtils; import htsjdk.samtools.SAMException; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SamReaderFactory; +import org.apache.commons.compress.compressors.FileNameUtil; import org.testng.Assert; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; @@ -796,5 +803,21 @@ public void isGZIPInputStreamTest(byte[] data, boolean isGzipped) throws IOExcep Assert.assertEquals(IOUtil.isGZIPInputStream(inputStream), isGzipped); } } + + @Test + public void testOpenFileForMd5CalculatingWriting() throws IOException { + Path output = Files.createTempFile("test", FileExtensions.BAM); + + try (final OutputStream outputStream = IOUtil.openFileForMd5CalculatingWriting(output)){ + // tsato: perhaps BamFileIoUtils is a better place for this test + BamFileIoUtils.blockCopyBamFile(HtsjdkTestUtils.NA12878_8000.toPath(), outputStream, false, false); + } catch (IOException e) { + throw new HtsjdkException("Encountered an IO error", e); + } + + final String md5FileName = output.getFileName() + FileExtensions.MD5; + Assert.assertTrue(md5FileName.endsWith(".bam.md5")); + Assert.assertTrue(Files.exists(output.resolveSibling(md5FileName))); + } }