Skip to content

Commit

Permalink
Added ability to get a VariantContexts from an InputStream (#1245)
Browse files Browse the repository at this point in the history
* new VCFIteratorBuilder allows you to convert an InputStream of a vcf/bcf into an iterator of VariantContexts
* new methods in IOUtils for identifying if a stream is gzipped or not
* deprecated SamStreams.isGzippedSAMFile and replaced it with IOUtil.isGZIPInputStream
  • Loading branch information
romanzenka authored and lbergelson committed Dec 19, 2018
1 parent 4a7cb03 commit 2473407
Show file tree
Hide file tree
Showing 8 changed files with 1,262 additions and 753 deletions.
2 changes: 1 addition & 1 deletion src/main/java/htsjdk/samtools/SamReaderFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ public SamReader open(final SamInputResource resource) {
}
} else if (BlockCompressedInputStream.isValidFile(bufferedStream)) {
primitiveSamReader = new SAMTextReader(new BlockCompressedInputStream(bufferedStream), validationStringency, this.samRecordFactory);
} else if (SamStreams.isGzippedSAMFile(bufferedStream)) {
} else if (IOUtil.isGZIPInputStream(bufferedStream)) {
primitiveSamReader = new SAMTextReader(new GZIPInputStream(bufferedStream), validationStringency, this.samRecordFactory);
} else if (SamStreams.isCRAMFile(bufferedStream)) {
if (referenceSource == null) {
Expand Down
23 changes: 4 additions & 19 deletions src/main/java/htsjdk/samtools/SamStreams.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedStreamConstants;
import htsjdk.samtools.util.IOUtil;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.zip.GZIPInputStream;

/**
* Utilities related to processing of {@link java.io.InputStream}s encoding SAM data
Expand Down Expand Up @@ -66,26 +66,11 @@ public static boolean isBAMFile(final InputStream stream)
/**
* Checks whether the file is a gzipped sam file. Returns true if it
* is and false otherwise.
* @see @link IOUtil#isGZIPInputStream(InputStream)
*/
@Deprecated
public static boolean isGzippedSAMFile(final InputStream stream) {
if (!stream.markSupported()) {
throw new IllegalArgumentException("Cannot test a stream that doesn't support marking.");
}
stream.mark(8000);

try {
final GZIPInputStream gunzip = new GZIPInputStream(stream);
final int ch = gunzip.read();
return true;
} catch (final IOException ioe) {
return false;
} finally {
try {
stream.reset();
} catch (final IOException ioe) {
throw new IllegalStateException("Could not reset stream.");
}
}
return IOUtil.isGZIPInputStream(stream);
}

// Its too expensive to examine the remote file to determine type.
Expand Down
48 changes: 39 additions & 9 deletions src/main/java/htsjdk/samtools/util/IOUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SamStreams;
import htsjdk.samtools.seekablestream.SeekableBufferedStream;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableHTTPStream;
Expand Down Expand Up @@ -110,7 +111,6 @@ public class IOUtil {
public static final Set<String> BLOCK_COMPRESSED_EXTENSIONS = Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(".gz", ".gzip", ".bgz", ".bgzf")));

private static int compressionLevel = Defaults.COMPRESSION_LEVEL;

/**
* Sets the GZip compression level for subsequent GZIPOutputStream object creation.
* @param compressionLevel 0 <= compressionLevel <= 9
Expand Down Expand Up @@ -571,12 +571,13 @@ else if (!dir.canRead()) {
* Checks that the two files are the same length, and have the same content, otherwise throws a runtime exception.
*/
public static void assertFilesEqual(final File f1, final File f2) {
try {
if (f1.length() != f2.length()) {
throw new SAMException("File " + f1 + " is " + f1.length() + " bytes but file " + f2 + " is " + f2.length() + " bytes.");
}
if (f1.length() != f2.length()) {
throw new SAMException("File " + f1 + " is " + f1.length() + " bytes but file " + f2 + " is " + f2.length() + " bytes.");
}
try (
final FileInputStream s1 = new FileInputStream(f1);
final FileInputStream s2 = new FileInputStream(f2);
) {
final byte[] buf1 = new byte[1024 * 1024];
final byte[] buf2 = new byte[1024 * 1024];
int len1;
Expand All @@ -589,12 +590,9 @@ public static void assertFilesEqual(final File f1, final File f2) {
throw new SAMException("Files " + f1 + " and " + f2 + " differ.");
}
}
s1.close();
s2.close();
} catch (IOException e) {
} catch (final IOException e) {
throw new SAMException("Exception comparing files " + f1 + " and " + f2, e);
}

}

/**
Expand Down Expand Up @@ -1157,6 +1155,38 @@ public static List<Path> filesToPaths(Collection<File> files){
return files.stream().map(File::toPath).collect(Collectors.toList());
}

/** number of bytes that will be read for the GZIP-header in the function {@link #isGZIPInputStream(InputStream)} */
public static final int GZIP_HEADER_READ_LENGTH = 8000;

/**
* Test whether a input stream looks like a GZIP input.
* @param stream the input stream.
* @return true if `stream` starts with a gzip signature
* @throws IllegalArgumentException if `stream` cannot mark or reset the stream
* @see SamStreams#isGzippedSAMFile(InputStream)
*/
public static boolean isGZIPInputStream(final InputStream stream) {
/* this function was previously implemented in SamStreams.isGzippedSAMFile */
if (!stream.markSupported()) {
throw new IllegalArgumentException("isGZIPInputStream() : Cannot test a stream that doesn't support marking.");
}
stream.mark(GZIP_HEADER_READ_LENGTH);

try {
final GZIPInputStream gunzip = new GZIPInputStream(stream);
final int ch = gunzip.read();
return true;
} catch (final IOException ioe) {
return false;
} finally {
try {
stream.reset();
} catch (final IOException ioe) {
throw new IllegalStateException("isGZIPInputStream(): Could not reset stream.");
}
}
}

/**
* Adds the extension to the given path.
*
Expand Down
19 changes: 19 additions & 0 deletions src/main/java/htsjdk/variant/bcf2/BCF2Codec.java
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ public final class BCF2Codec extends BinaryFeatureCodec<VariantContext> {
private final static int ALLOWED_MAJOR_VERSION = 2;
private final static int MIN_MINOR_VERSION = 1;

/** sizeof a BCF header (+ min/max version). Used when trying to detect when a streams starts with a bcf header */
public static final int SIZEOF_BCF_HEADER = BCFVersion.MAGIC_HEADER_START.length + 2*Byte.BYTES;


private BCFVersion bcfVersion = null;

private VCFHeader header = null;
Expand Down Expand Up @@ -477,4 +481,19 @@ protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String
private void error(final String message) throws RuntimeException {
throw new TribbleException(String.format("%s, at record %d with position %d:", message, recordNo, pos));
}

/** try to read a BCFVersion from an uncompressed BufferedInputStream.
* The buffer must be large enough to contain {@link #SIZEOF_BCF_HEADER}
*
* @param bufferedinput the uncompressed input stream
* @return the BCFVersion if it can be decoded, or null if not found.
* @throws IOException
*/
public static BCFVersion tryReadBCFVersion(final BufferedInputStream uncompressedBufferedInput) throws IOException {
uncompressedBufferedInput.mark(SIZEOF_BCF_HEADER);
final BCFVersion bcfVersion = BCFVersion.readBCFVersion(uncompressedBufferedInput);
uncompressedBufferedInput.reset();
return bcfVersion;
}

}
56 changes: 56 additions & 0 deletions src/main/java/htsjdk/variant/vcf/VCFIterator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package htsjdk.variant.vcf;

import htsjdk.samtools.util.CloseableIterator;
import htsjdk.variant.variantcontext.VariantContext;

/**
* An iterator of `VariantContext`. This iterator can be used to
* decode VCF data on the fly .
*
* Example:
*
* <pre>
* VCFIterator r = new VCFIteratorBuilder().open(System.in);
* while (r.hasNext()) {
* System.out.println(r.next());
* }
* r.close();
* </pre>
*
* @author Pierre Lindenbaum / @yokofakun
* @see htsjdk.variant.vcf.VCFIteratorBuilder
*
*/
public interface VCFIterator extends CloseableIterator<VariantContext> {
/** Returns the VCFHeader associated with this VCF/BCF file. */
public VCFHeader getHeader();

/**
* Returns the next object but does not advance the iterator. Subsequent
* calls to peek() and next() will return the same object.
*/
public VariantContext peek();
}
Loading

0 comments on commit 2473407

Please sign in to comment.