/**
 * Static argument-validation helpers. Not instantiable.
 */
public class Utils {

    // Static utility holder; no instances are needed.
    private Utils() {
    }

    /**
     * Checks that {@code object} is not {@code null} and returns it unchanged.
     *
     * @param object any object
     * @param <T> the type of the object being checked
     * @return the same {@code object}
     * @throws IllegalArgumentException if {@code object == null}
     */
    public static <T> T nonNull(final T object) {
        return Utils.nonNull(object, "Null object is not allowed here.");
    }

    /**
     * Checks that {@code object} is not {@code null} and returns it unchanged.
     *
     * @param object any object
     * @param message the text passed to the exception thrown when {@code object == null}
     * @param <T> the type of the object being checked
     * @return the same {@code object}
     * @throws IllegalArgumentException if {@code object == null}
     */
    public static <T> T nonNull(final T object, final String message) {
        if (object == null) {
            throw new IllegalArgumentException(message);
        }
        return object;
    }

    /**
     * Checks that {@code object} is not {@code null} and returns it unchanged. The message is
     * produced lazily: {@code message.get()} is only called when the check fails.
     *
     * @param object any object
     * @param message supplier of the text passed to the exception thrown when {@code object == null}
     * @param <T> the type of the object being checked
     * @return the same {@code object}
     * @throws IllegalArgumentException if {@code object == null}
     */
    public static <T> T nonNull(final T object, final Supplier<String> message) {
        if (object == null) {
            throw new IllegalArgumentException(message.get());
        }
        return object;
    }
}
try (InputStream fis = Files.newInputStream(IOUtil.getPath(path)) ){ final BCFVersion version = BCFVersion.readBCFVersion(fis); return version != null && version.getMajorVersion() == ALLOWED_MAJOR_VERSION; diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java b/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java index f2fb1a8e2d..156969c3f4 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java @@ -29,7 +29,6 @@ import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; -import htsjdk.variant.vcf.VCFIDHeaderLine; import java.io.File; import java.io.FileNotFoundException; @@ -95,10 +94,9 @@ public static ArrayList makeDictionary(final VCFHeader header) { // set up the strings dictionary for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) { if ( line.shouldBeAddedToDictionary() ) { - final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line; - if ( ! seen.contains(idLine.getID())) { - dict.add(idLine.getID()); - seen.add(idLine.getID()); + if ( ! seen.contains(line.getID())) { + dict.add(line.getID()); + seen.add(line.getID()); } } } @@ -293,7 +291,7 @@ else if ( o.getClass().isArray() ) { * Are the elements and their order in the output and input headers consistent so that * we can write out the raw genotypes block without decoding and recoding it? * - * If the order of INFO, FILTER, or contrig elements in the output header is different than + * If the order of INFO, FILTER, or contig elements in the output header is different than * in the input header we must decode the blocks using the input header and then recode them * based on the new output order. * @@ -310,15 +308,15 @@ public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHe if ( ! 
nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) ) return false; - final Iterator outputLinesIt = outputHeader.getIDHeaderLines().iterator(); - final Iterator inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator(); + final Iterator outputLinesIt = outputHeader.getStructuredHeaderLines().iterator(); + final Iterator inputLinesIt = genotypesBlockHeader.getStructuredHeaderLines().iterator(); while ( inputLinesIt.hasNext() ) { if ( ! outputLinesIt.hasNext() ) // missing lines in output return false; - final VCFIDHeaderLine outputLine = outputLinesIt.next(); - final VCFIDHeaderLine inputLine = inputLinesIt.next(); + final VCFHeaderLine outputLine = outputLinesIt.next(); + final VCFHeaderLine inputLine = inputLinesIt.next(); if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! inputLine.getID().equals(outputLine.getID()) ) return false; diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java b/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java index 71aef13424..ed5792b93c 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java @@ -26,8 +26,11 @@ package htsjdk.variant.variantcontext.writer; import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.util.Log; import htsjdk.samtools.util.RuntimeIOException; +import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.IndexCreator; +import htsjdk.utils.Utils; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.vcf.VCFConstants; @@ -35,6 +38,7 @@ import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderVersion; +import htsjdk.variant.vcf.VCFUtils; import java.io.BufferedWriter; import java.io.ByteArrayOutputStream; @@ -43,14 +47,15 @@ import 
java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; +import java.util.stream.Collectors; /** * this class writes VCF files */ class VCFWriter extends IndexingVariantContextWriter { + protected final static Log logger = Log.getInstance(VCFWriter.class); - private static final String VERSION_LINE = - VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_2.getFormatString() + "=" + VCFHeaderVersion.VCF4_2.getVersionString(); + private static final String DEFAULT_VERSION_LINE = VCFHeader.METADATA_INDICATOR + VCFHeader.DEFAULT_VCF_VERSION.getVersionLine(); // Initialized when the header is written to the output stream private VCFEncoder vcfEncoder = null; @@ -141,7 +146,7 @@ public void writeHeader(final VCFHeader header) { } public static String getVersionLine() { - return VERSION_LINE; + return DEFAULT_VERSION_LINE; } public static VCFHeader writeHeader(VCFHeader header, @@ -152,12 +157,18 @@ public static VCFHeader writeHeader(VCFHeader header, header = doNotWriteGenotypes ? new VCFHeader(header.getMetaDataInSortedOrder()) : header; try { - // the file format field needs to be written first + // Validate that the file version we're writing is version-compatible this header's version. 
+ validateHeaderVersion(header, versionLine); + + // The file format field needs to be written first; below any file format lines + // embedded in the header will be removed writer.write(versionLine + "\n"); for (final VCFHeaderLine line : header.getMetaDataInSortedOrder() ) { - if ( VCFHeaderVersion.isFormatString(line.getKey()) ) + // Remove the fileformat header lines + if ( VCFHeaderVersion.isFormatString(line.getKey()) ) { continue; + } writer.write(VCFHeader.METADATA_INDICATOR); writer.write(line.toString()); @@ -166,14 +177,9 @@ public static VCFHeader writeHeader(VCFHeader header, // write out the column line writer.write(VCFHeader.HEADER_INDICATOR); - boolean isFirst = true; - for (final VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) { - if ( isFirst ) - isFirst = false; // don't write out a field separator - else - writer.write(VCFConstants.FIELD_SEPARATOR); - writer.write(field.toString()); - } + writer.write(header.getHeaderFields().stream() + .map(f -> f.name()) + .collect(Collectors.joining(VCFConstants.FIELD_SEPARATOR)).toString()); if ( header.hasGenotypingData() ) { writer.write(VCFConstants.FIELD_SEPARATOR); @@ -194,6 +200,32 @@ public static VCFHeader writeHeader(VCFHeader header, return header; } + /** + * Given a header and a target output version, see if the header's version is compatible with the + * requested version. 
+ * @param header + * @param versionLine + */ + private static void validateHeaderVersion(final VCFHeader header, final String versionLine) { + Utils.nonNull(header); + Utils.nonNull(versionLine); + + final VCFHeaderVersion vcfVersion = header.getHeaderVersion(); + if (!vcfVersion.equals(VCFHeaderVersion.getHeaderVersion(versionLine))) { + final String message = String.format("Attempt to write a version %s VCF header to a version %s VCF output", + vcfVersion.getVersionString(), + versionLine); + if (VCFHeaderVersion.versionsAreCompatible(VCFHeaderVersion.getHeaderVersion(versionLine), vcfVersion)) { + if (VCFUtils.getStrictVCFVersionValidation()) { + throw new TribbleException(message); + } + } + if (VCFUtils.getVerboseVCFLogging()) { + logger.warn(message); + } + } + } + /** * attempt to close the VCF file */ diff --git a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java index 8a55a19462..afe64a123d 100644 --- a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java @@ -26,13 +26,16 @@ package htsjdk.variant.vcf; import htsjdk.samtools.util.BlockCompressedInputStream; +import htsjdk.samtools.util.Log; import htsjdk.samtools.util.IOUtil; import htsjdk.tribble.AsciiFeatureCodec; import htsjdk.tribble.Feature; import htsjdk.tribble.NameAwareCodec; import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.tabix.TabixFormat; +import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.util.ParsingUtils; +import htsjdk.utils.Utils; import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; @@ -42,7 +45,6 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; -import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; @@ -50,6 +52,7 @@ 
import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -60,10 +63,13 @@ import java.util.StringTokenizer; import java.util.zip.GZIPInputStream; +import static htsjdk.variant.vcf.VCFConstants.*; + public abstract class AbstractVCFCodec extends AsciiFeatureCodec implements NameAwareCodec { - public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20); + protected final static Log logger = Log.getInstance(AbstractVCFCodec.class); + public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20); protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column // we have to store the list of strings that make up the header until they're needed @@ -71,26 +77,22 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec protected VCFHeaderVersion version = null; // a mapping of the allele - protected Map> alleleMap = new HashMap>(3); + protected final Map> alleleMap = new HashMap<>(3); - // for performance testing purposes - public static boolean validate = true; - // a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over // todo: make this thread safe? 
protected String[] parts = null; protected String[] genotypeParts = null; - protected final String[] locParts = new String[6]; // for performance we cache the hashmap of filter encodings for quick lookup - protected HashMap> filterHash = new HashMap>(); + protected final HashMap> filterHash = new HashMap<>(); // we store a name to give to each of the variant contexts we emit protected String name = "Unknown"; protected int lineNo = 0; - protected Map stringCache = new HashMap(); + protected final Map stringCache = new HashMap<>(); protected boolean warnedAboutNoEqualsForNonFlag = false; @@ -128,17 +130,76 @@ class LazyVCFGenotypesParser implements LazyGenotypesContext.LazyParser { @Override public LazyGenotypesContext.LazyData parse(final Object data) { - //System.out.printf("Loading genotypes... %s:%d%n", contig, start); return createGenotypeMap((String) data, alleles, contig, start); } } /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * @param filterString the string to parse - * @return a set of the filters applied + * Return true if this codec can handle the target version + * @param targetVersion + * @return true if this codec can handle this version */ - protected abstract List parseFilters(String filterString); + public abstract boolean canDecodeVersion(final VCFHeaderVersion targetVersion); + + // TODO: Note: This method was lifted from duplicate methods in the codec subclasses. + /** + * Reads all of the header from the provided iterator, but reads no further. + * @param lineIterator the line reader to take header lines from + * @return The parsed header + */ + @Override + public Object readActualHeader(final LineIterator lineIterator) { + final List headerStrings = new ArrayList<>(); + + // Extract one line and retrieve the file format and version, which must be the first line, + // and then add it back into the headerLines. 
+ final VCFHeaderVersion fileFormatVersion = readFormatVersionLine(lineIterator); + headerStrings.add(VCFHeader.METADATA_INDICATOR + fileFormatVersion.getVersionLine()); + + // collect metadata lines until we hit the required header line, or a non-metadata line, + // in which case throw since there was no header line + // TODO: Optimization: There is no reason we couldn't just parse the header lines right here + // instead of accumulating them in a list and then making another pass to convert them + while (lineIterator.hasNext()) { + final String line = lineIterator.next(); + if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { + lineNo++; + headerStrings.add(line); + if (!line.startsWith(VCFHeader.METADATA_INDICATOR)) { + this.header = parseHeaderFromLines(headerStrings, fileFormatVersion); + break; + } + } else { + throw new TribbleException.InvalidHeader( + "The required header line (starting with one #) is missing in the input VCF file"); + } + } + return this.header; + } + + /** + * Read ahead one line to obtain and return the vcf header version for this file + * + * @param headerLineIterator + * @return VCFHeaderVersion for this file + * @throws TribbleException if no file format header line is found in the first line or, the version can't + * be handled by this codec + */ + protected VCFHeaderVersion readFormatVersionLine(final LineIterator headerLineIterator) { + if (headerLineIterator.hasNext()) { + final String headerVersionLine = headerLineIterator.next(); + if (headerVersionLine.startsWith(VCFHeader.METADATA_INDICATOR)) { + final VCFHeaderVersion vcfFileVersion = VCFHeaderVersion.getHeaderVersion(headerVersionLine); + if (!canDecodeVersion(vcfFileVersion)) { + throw new TribbleException.InvalidHeader( + String.format("The \"(%s)\" codec does not support VCF version: %s", getName(), vcfFileVersion)); + } else { + return vcfFileVersion; + } + } + } + throw new TribbleException.InvalidHeader("The VCF version header line is missing"); + } /** * create a VCF 
header from a set of header record lines @@ -146,100 +207,256 @@ public LazyGenotypesContext.LazyData parse(final Object data) { * @param headerStrings a list of strings that represent all the ## and # entries * @return a VCFHeader object */ - protected VCFHeader parseHeaderFromLines( final List headerStrings, final VCFHeaderVersion version ) { - this.version = version; + protected VCFHeader parseHeaderFromLines( final List headerStrings, final VCFHeaderVersion sourceVersion ) { + this.version = sourceVersion; - Set metaData = new LinkedHashSet(); - Set sampleNames = new LinkedHashSet(); + final Set metaData = new LinkedHashSet<>(); + Set sampleNames = new LinkedHashSet<>(); int contigCounter = 0; - // iterate over all the passed in strings - for ( String str : headerStrings ) { - if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { - String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR); - if ( strings.length < VCFHeader.HEADER_FIELDS.values().length ) - throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str); - - int arrayIndex = 0; - for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { - try { - if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex])) - throw new TribbleException.InvalidHeader("we were expecting column name '" + field + "' but we saw '" + strings[arrayIndex] + "'"); - } catch (IllegalArgumentException e) { - throw new TribbleException.InvalidHeader("unknown column name '" + strings[arrayIndex] + "'; it does not match a legal column header name."); - } - arrayIndex++; - } - boolean sawFormatTag = false; - if ( arrayIndex < strings.length ) { - if ( !strings[arrayIndex].equals("FORMAT") ) - throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'"); - sawFormatTag = true; - arrayIndex++; + for ( String headerLine : headerStrings ) { + if ( 
!headerLine.startsWith(VCFHeader.METADATA_INDICATOR) ) { + sampleNames = parsePrimaryHeaderLine(headerLine); + } else { + if ( headerLine.startsWith(VCFConstants.INFO_HEADER_START) ) { + metaData.add(getInfoHeaderLine(headerLine.substring(INFO_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.FILTER_HEADER_START) ) { + metaData.add(getFilterHeaderLine(headerLine.substring(FILTER_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.FORMAT_HEADER_START) ) { + metaData.add(getFormatHeaderLine(headerLine.substring(FORMAT_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.CONTIG_HEADER_START) ) { + metaData.add(getContigHeaderLine(headerLine.substring(CONTIG_HEADER_OFFSET), sourceVersion, contigCounter++)); + } else if ( headerLine.startsWith(VCFConstants.ALT_HEADER_START) ) { + metaData.add(getAltHeaderLine(headerLine.substring(ALT_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.PEDIGREE_HEADER_START) ) { + metaData.add(getPedigreeHeaderLine(headerLine.substring(PEDIGREE_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.META_HEADER_START) ) { + metaData.add(getMetaHeaderLine(headerLine.substring(META_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.SAMPLE_HEADER_START) ) { + metaData.add(getSampleHeaderLine(headerLine.substring(SAMPLE_HEADER_OFFSET), sourceVersion)); + } else { + VCFHeaderLine otherHeaderLine = getOtherHeaderLine( + headerLine.substring(VCFHeader.METADATA_INDICATOR.length()), + sourceVersion); + if (otherHeaderLine != null) + metaData.add(otherHeaderLine); } + } + } - while ( arrayIndex < strings.length ) - sampleNames.add(strings[arrayIndex++]); + this.header = new VCFHeader(sourceVersion, metaData, sampleNames); - if ( sawFormatTag && sampleNames.isEmpty()) - throw new TribbleException.InvalidHeader("The FORMAT field was provided but there is no genotype/sample 
data"); + if ( doOnTheFlyModifications ) { + this.header = VCFStandardHeaderLines.repairStandardHeaderLines(this.header); + } + return this.header; + } - // If we're performing sample name remapping and there is exactly one sample specified in the header, replace - // it with the remappedSampleName. Throw an error if there are 0 or multiple samples and remapping was requested - // for this file. - if ( remappedSampleName != null ) { - // We currently only support on-the-fly sample name remapping for single-sample VCFs - if ( sampleNames.isEmpty() || sampleNames.size() > 1 ) { - throw new TribbleException(String.format("Cannot remap sample name to %s because %s samples are specified in the VCF header, and on-the-fly sample name remapping is only supported for single-sample VCFs", - remappedSampleName, sampleNames.isEmpty() ? "no" : "multiple")); - } + /** + * Create and return a VCFInfoHeader object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be validate for this header version. + * @return a VCFInfoHeaderLine object + */ + public VCFInfoHeaderLine getInfoHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + return new VCFInfoHeaderLine(headerLineString, sourceVersion); + } - sampleNames.clear(); - sampleNames.add(remappedSampleName); - } + /** + * Create and return a VCFFormatHeader object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be validate for this header version. 
    /**
     * Create and return a VCFFormatHeaderLine object from a header line string that conforms to the {@code sourceVersion}
     * @param headerLineString VCF header line being parsed without the leading "##"
     * @param sourceVersion the VCF header version of the source from which the line was retrieved. The resulting
     *                      header line object should be valid for this header version.
     * @return a VCFFormatHeaderLine object
     */
    public VCFFormatHeaderLine getFormatHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
        return new VCFFormatHeaderLine(headerLineString, sourceVersion);
    }

    /**
     * Create and return a VCFFilterHeaderLine object from a header line string that conforms to the {@code sourceVersion}
     * @param headerLineString VCF header line being parsed without the leading "##"
     * @param sourceVersion the VCF header version of the source from which the line was retrieved. The resulting
     *                      header line object should be valid for this header version.
     * @return a VCFFilterHeaderLine object
     */
    public VCFFilterHeaderLine getFilterHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
        return new VCFFilterHeaderLine(headerLineString, sourceVersion);
    }

    /**
     * Create and return a VCFContigHeaderLine object from a header line string that conforms to the {@code sourceVersion}
     * @param headerLineString VCF header line being parsed without the leading "##"
     * @param sourceVersion the VCF header version of the source from which the line was retrieved. The resulting
     *                      header line object should be valid for this header version.
     * @param contigIndex index handed to the VCFContigHeaderLine constructor; the header parser in this class
     *                    passes a counter that starts at 0 and increases by one per contig line
     * @return a VCFContigHeaderLine object
     */
    public VCFContigHeaderLine getContigHeaderLine(
            final String headerLineString,
            final VCFHeaderVersion sourceVersion,
            final int contigIndex) {
        return new VCFContigHeaderLine(headerLineString, sourceVersion, contigIndex);
    }
    /**
     * Create and return a VCFAltHeaderLine object from a header line string that conforms to the {@code sourceVersion}
     * @param headerLineString VCF header line being parsed without the leading "##"
     * @param sourceVersion the VCF header version of the source from which the line was retrieved. The resulting
     *                      header line object should be valid for this header version.
     * @return a VCFAltHeaderLine object
     */
    public VCFAltHeaderLine getAltHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
        return new VCFAltHeaderLine(headerLineString, sourceVersion);
    }

    /**
     * Create and return a VCFPedigreeHeaderLine object from a header line string that conforms to the {@code sourceVersion}
     * @param headerLineString VCF header line being parsed without the leading "##"
     * @param sourceVersion the VCF header version of the source from which the line was retrieved. The resulting
     *                      header line object should be valid for this header version.
     * @return a VCFPedigreeHeaderLine object
     */
    public VCFPedigreeHeaderLine getPedigreeHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
        return new VCFPedigreeHeaderLine(headerLineString, sourceVersion);
    }

    /**
     * Create and return a VCFMetaHeaderLine object from a header line string that conforms to the {@code sourceVersion}
     * @param headerLineString VCF header line being parsed without the leading "##"
     * @param sourceVersion the VCF header version of the source from which the line was retrieved. The resulting
     *                      header line object should be valid for this header version.
     * @return a VCFMetaHeaderLine object
     */
    public VCFMetaHeaderLine getMetaHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
        return new VCFMetaHeaderLine(headerLineString, sourceVersion);
    }
+ * @return a VCFSampleHeaderLine object + */ + public VCFSampleHeaderLine getSampleHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + return new VCFSampleHeaderLine(headerLineString, sourceVersion); + } + + /** + * Create and return a basic VCFHeaderLine. + * + * @param headerLineString VCF header line being parsed without the leading "##" + * @param targetVersion VCFHeaderVersion being parsed + * @return a VCFHeaderLine + */ + public VCFHeaderLine getOtherHeaderLine(final String headerLineString, final VCFHeaderVersion targetVersion) { + final int indexOfEquals = headerLineString.indexOf('='); + if ( indexOfEquals < 3 ) { // must at least have "##?=" + // TODO: NOTE: the old code silently dropped metadata lines with no "="; now we log, or throw for verbose logging + if (VCFUtils.getStrictVCFVersionValidation()) { + throw new TribbleException.InvalidHeader("Unrecognized metadata line type: " + headerLineString); + } + if (VCFUtils.getVerboseVCFLogging()) { + logger.warn("Dropping unrecognized metadata line type: " + headerLineString); + } + return null; + } else { + final String headerLineValue = headerLineString.substring(indexOfEquals + 1); + if (headerLineValue.startsWith("<") && headerLineValue.endsWith(">")) { + return new VCFStructuredHeaderLine( + headerLineString.substring(0, indexOfEquals), + headerLineString.substring(indexOfEquals + 1), + targetVersion); } else { - if ( str.startsWith(VCFConstants.INFO_HEADER_START) ) { - final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7), version); - metaData.add(info); - } else if ( str.startsWith(VCFConstants.FILTER_HEADER_START) ) { - final VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9), version); - metaData.add(filter); - } else if ( str.startsWith(VCFConstants.FORMAT_HEADER_START) ) { - final VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9), version); - metaData.add(format); - } else if ( 
str.startsWith(VCFConstants.CONTIG_HEADER_START) ) { - final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++); - metaData.add(contig); - } else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) { - final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description")); - metaData.add(alt); - } else { - int equals = str.indexOf('='); - if ( equals != -1 ) - metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1))); + return new VCFHeaderLine(headerLineString.substring(0, indexOfEquals), headerLineString.substring(indexOfEquals + 1)); + } + } + } + + // Parse the primary header line of the form: + // + // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ... + // + // The string passed in is the first non-metadata line we've seen, so it should conform. + // + private Set parsePrimaryHeaderLine(final String headerLine) { + final Set sampleNames = new LinkedHashSet<>(); + + final String[] columns = headerLine.substring(1).split(VCFConstants.FIELD_SEPARATOR); + if ( columns.length < VCFHeader.HEADER_FIELDS.values().length ) { + throw new TribbleException.InvalidHeader("not enough columns present in header line: " + headerLine); + } + + int col = 0; + for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { + try { + if (field != VCFHeader.HEADER_FIELDS.valueOf(columns[col])) { + throw new TribbleException.InvalidHeader("expected column headerLineID '" + field + "' but saw '" + columns[col] + "'"); } + } catch (IllegalArgumentException e) { + throw new TribbleException.InvalidHeader("column headerLineID '" + columns[col] + "' is not a legal column header headerLineID."); } + col++; } - this.header = new VCFHeader(metaData, sampleNames); - if ( doOnTheFlyModifications ) - this.header = VCFStandardHeaderLines.repairStandardHeaderLines(this.header); - 
return this.header; + boolean sawFormatTag = false; + if ( col < columns.length ) { + if ( !columns[col].equals("FORMAT") ) + throw new TribbleException.InvalidHeader("expected column headerLineID 'FORMAT' but saw '" + columns[col] + "'"); + sawFormatTag = true; + col++; + } + + while ( col < columns.length ) { + sampleNames.add(columns[col++]); + } + + if ( sawFormatTag && sampleNames.isEmpty()) + throw new TribbleException.InvalidHeader("The FORMAT field was provided but there is no genotype/sample data"); + + // If we're performing sample name remapping and there is exactly one sample specified in the header, replace + // it with the remappedSampleName. Throw an error if there are 0 or multiple samples and remapping was requested + // for this file. + if ( remappedSampleName != null ) { + // We currently only support on-the-fly sample name remapping for single-sample VCFs + if ( sampleNames.isEmpty() || sampleNames.size() > 1 ) { + throw new TribbleException( + String.format("Cannot remap sample headerLineID to %s because %s samples are specified in the VCF header, " + + "and on-the-fly sample headerLineID remapping is only supported for single-sample VCFs", + remappedSampleName, sampleNames.isEmpty() ? "no" : "multiple")); + } + + sampleNames.clear(); + sampleNames.add(remappedSampleName); + } + + return sampleNames; } - /** + /** * Explicitly set the VCFHeader on this codec. This will overwrite the header read from the file * and the version state stored in this instance; conversely, reading the header from a file will - * overwrite whatever is set here. The returned header may not be identical to the header argument - * since the header lines may be "repaired" (i.e., rewritten) if doOnTheFlyModifications is set. + * overwrite whatever is set here. 
+ * + * The returned header may not be identical to, or may even be a complete replacement for, the + * input header argument, since the header lines may be "repaired" (i.e., rewritten) if + * doOnTheFlyModifications is set. */ - public VCFHeader setVCFHeader(final VCFHeader header, final VCFHeaderVersion version) { - this.version = version; + public VCFHeader setVCFHeader(final VCFHeader newHeader, final VCFHeaderVersion newVersion) { + Utils.nonNull(newHeader); + Utils.nonNull(newVersion); - if (this.doOnTheFlyModifications) this.header = VCFStandardHeaderLines.repairStandardHeaderLines(header); - else this.header = header; + // force header validation for the target version; hopefully this isn't actually + // setting a new version on the header + newHeader.setHeaderVersion(newVersion); + this.header = newHeader; + this.version = newVersion; + + if (this.doOnTheFlyModifications) { + this.header = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader); + } return this.header; } @@ -321,16 +538,19 @@ else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) final String alts = getCachedString(parts[4]); builder.log10PError(parseQual(parts[5])); - final List filters = parseFilters(getCachedString(parts[6])); - if ( filters != null ) builder.filters(new HashSet(filters)); + final Set filters = parseFilters(getCachedString(parts[6])); + if ( filters != null ) { + builder.filters(new HashSet<>(filters)); + } final Map attrs = parseInfo(parts[7]); builder.attributes(attrs); - if ( attrs.containsKey(VCFConstants.END_KEY) ) { - // update stop with the end key if provided + // update stop with the end key if provided + Object endValue= attrs.get(VCFConstants.END_KEY); + if ( endValue != null ) { try { - builder.stop(Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString())); - } catch (Exception e) { + builder.stop(Integer.valueOf(endValue.toString())); + } catch (NumberFormatException e) { generateException("the END value in the INFO field is not valid"); } } else { 
@@ -397,20 +617,67 @@ protected String getCachedString(String str) { return internedString; } + // TODO: Note: This method was lifted from duplicate methods in the codec subclasses. + /** + * parse the filter string, first checking to see if we already have parsed it in a previous attempt + * @param filterString the string to parse + * @return a set of the filters applied + */ + protected Set parseFilters(final String filterString) { + // null for unfiltered + if ( filterString.equals(VCFConstants.UNFILTERED) ) + return null; + + if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) + return Collections.emptySet(); + if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) + generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter headerLineID in vcf4", lineNo); + if (filterString.isEmpty()) + generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); + + // do we have the filter string cached? + if ( filterHash.containsKey(filterString) ) + return filterHash.get(filterString); + + // empty set for passes filters + final Set fFields = new HashSet<>(); + // otherwise we have to parse and cache the value + if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) + fFields.add(filterString); + else { + // Variant context uses a Set to store these, so duplicates were getting dropped anyway + // in previous versions. Warn for old version; throw for V43+. 
+ String[] filters = filterString.split(VCFConstants.FILTER_CODE_SEPARATOR); + for (int i = 0; i < filters.length; i++) { + if (!fFields.add(filters[i])) { + String message = String.format( + "Filters must be unique; filter field \"%s\" in the vicinity of " + + "line %d has duplicate filters", filterString, lineNo); + reportDuplicateFilterIDs(message); + } + } + } + + filterHash.put(filterString, Collections.unmodifiableSet(fFields)); + + return fFields; + } + /** * parse out the info fields * @param infoField the fields * @return a mapping of keys to objects */ - private Map parseInfo(String infoField) { - Map attributes = new HashMap(); + protected Map parseInfo(String infoField) { + Map attributes = new HashMap<>(); if ( infoField.isEmpty() ) generateException("The VCF specification requires a valid (non-zero length) info field"); if ( !infoField.equals(VCFConstants.EMPTY_INFO_FIELD) ) { - if ( infoField.indexOf('\t') != -1 || infoField.indexOf(' ') != -1 ) - generateException("The VCF specification does not allow for whitespace in the INFO field. Offending field value was \"" + infoField + "\""); + if ( infoField.indexOf('\t') != -1 ) { + generateException("The VCF specification does not allow for tab characters in the INFO field. Offending field value was \"" + infoField + "\""); + } List infoFields = ParsingUtils.split(infoField, VCFConstants.INFO_FIELD_SEPARATOR_CHAR); for (int i = 0; i < infoFields.size(); i++) { @@ -438,8 +705,8 @@ private Map parseInfo(String infoField) { key = infoFields.get(i); final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key); if ( headerLine != null && headerLine.getType() != VCFHeaderLineType.Flag ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED && ! 
warnedAboutNoEqualsForNonFlag ) { - System.err.println("Found info key " + key + " without a = value, but the header says the field is of type " + if ( !warnedAboutNoEqualsForNonFlag ) { + logger.warn("Found info key " + key + " without a = value, but the header says the field is of type " + headerLine.getType() + " but this construct is only value for FLAG type fields"); warnedAboutNoEqualsForNonFlag = true; } @@ -453,6 +720,10 @@ private Map parseInfo(String infoField) { // this line ensures that key/value pairs that look like key=; are parsed correctly as MISSING if ( "".equals(value) ) value = VCFConstants.MISSING_VALUE_v4; + if (attributes.containsKey(key)) { + reportDuplicateInfoKeyValue(key, infoField); + } + attributes.put(key, value); } } @@ -460,6 +731,19 @@ private Map parseInfo(String infoField) { return attributes; } + /** + * Handle reporting of duplicate filter IDs + * @param duplicateFilterMessage + */ + protected abstract void reportDuplicateFilterIDs(final String duplicateFilterMessage); + + /** + * Handle report of duplicate info line field values + * @param key + * @param infoLine + */ + public abstract void reportDuplicateInfoKeyValue(final String key, final String infoLine); + /** * create a an allele from an index and an array of alleles * @param index the index @@ -620,6 +904,9 @@ private static void parseSingleAltAllele(List alleles, String alt, int l alleles.add(allele); } + // TODO: What is the intended meaning of a return value of true ? 
This class is abstract and can't + // decode anything directly, but it will return true for ANY 4.x file when passed a string with + // the prefix "##fileformat=VCFv4" (or worse, for any vcf file if passed "##fileformat=VCFv") public static boolean canDecodeFile(final String potentialInput, final String MAGIC_HEADER_LINE) { try { Path path = IOUtil.getPath(potentialInput); @@ -704,8 +991,8 @@ public LazyGenotypesContext.LazyData createGenotypeMap(final String str, } else if ( missing ) { // if its truly missing (there no provided value) skip adding it to the attributes } else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { - final List filters = parseFilters(getCachedString(genotypeValues.get(i))); - if ( filters != null ) gb.filters(filters); + final Set filters = parseFilters(getCachedString(genotypeValues.get(i))); + if ( filters != null ) gb.filters(new ArrayList<>(filters)); } else if ( genotypeValues.get(i).equals(VCFConstants.MISSING_VALUE_v4) ) { // don't add missing values to the map } else { @@ -784,11 +1071,11 @@ public void setRemappedSampleName( final String remappedSampleName ) { } protected void generateException(String message) { - throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); + throw new TribbleException(String.format("Failure parsing VCF file at (approximately) line number %d: %s", lineNo, message)); } protected static void generateException(String message, int lineNo) { - throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); + throw new TribbleException(String.format("Failure parsing VCF file at (approximately) line number %d: %s", lineNo, message)); } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCF3Codec.java b/src/main/java/htsjdk/variant/vcf/VCF3Codec.java index e9ca3abdf7..355628f771 100644 --- a/src/main/java/htsjdk/variant/vcf/VCF3Codec.java +++ 
b/src/main/java/htsjdk/variant/vcf/VCF3Codec.java @@ -25,12 +25,9 @@ package htsjdk.variant.vcf; -import htsjdk.tribble.TribbleException; -import htsjdk.tribble.readers.LineIterator; - -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; +import java.util.HashSet; +import java.util.Set; /** @@ -52,46 +49,28 @@ public class VCF3Codec extends AbstractVCFCodec { public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3"; + public VCF3Codec() { + // TODO: This defaults to "Unknown" and winds up in every VariantContext. Setting it + // here breaks some GATK4 tests. Should we put useful something here ? + //setName(String.format("htsjdk:%s:%s", + // VCFHeaderVersion.VCF3_2.getVersionString(), + // VCFHeaderVersion.VCF3_3.getVersionString())); + } + /** - * @param reader the line reader to take header lines from - * @return the number of header lines + * Return true if this codec can handle the target version + * @param targetHeaderVersion + * @return true if this codec can handle this version */ @Override - public Object readActualHeader(final LineIterator reader) { - final List headerStrings = new ArrayList(); - - VCFHeaderVersion version = null; - boolean foundHeaderVersion = false; - while (reader.hasNext()) { - lineNo++; - final String line = reader.peek(); - if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { - final String[] lineFields = line.substring(2).split("="); - if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { - if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - foundHeaderVersion = true; - version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( version != VCFHeaderVersion.VCF3_3 && version != VCFHeaderVersion.VCF3_2 ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv3 and does not support " + lineFields[1]); - } - headerStrings.add(reader.next()); - } - else 
if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { - if (!foundHeaderVersion) { - throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); - } - headerStrings.add(reader.next()); - return super.parseHeaderFromLines(headerStrings, version); - } - else { - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - } - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); + public boolean canDecodeVersion(final VCFHeaderVersion targetHeaderVersion) { + return targetHeaderVersion == VCFHeaderVersion.VCF3_3 || targetHeaderVersion == VCFHeaderVersion.VCF3_2; } + @Override + public boolean canDecode(final String potentialInputFile) { + return canDecodeFile(potentialInputFile, VCF3_MAGIC_HEADER); + } /** * parse the filter string, first checking to see if we already have parsed it in a previous attempt @@ -99,24 +78,24 @@ else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { * @return a set of the filters applied */ @Override - protected List parseFilters(String filterString) { + protected Set parseFilters(String filterString) { // null for unfiltered if ( filterString.equals(VCFConstants.UNFILTERED) ) return null; // empty set for passes filters - List fFields = new ArrayList(); + HashSet fFields = new HashSet<>(); if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - return new ArrayList(fFields); + return new HashSet<>(fFields); if (filterString.isEmpty()) generateException("The VCF specification requires a valid filter status"); // do we have the filter string cached? 
if ( filterHash.containsKey(filterString) ) - return new ArrayList(filterHash.get(filterString)); + return new HashSet<>(filterHash.get(filterString)); // otherwise we have to parse and cache the value if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) @@ -129,8 +108,20 @@ protected List parseFilters(String filterString) { return fFields; } + /** + * Handle reporting of duplicate filter IDs + * @param duplicateFilterMessage + */ @Override - public boolean canDecode(final String potentialInput) { - return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER); + protected void reportDuplicateFilterIDs(final String duplicateFilterMessage) { + // no-op since this codec's parseFilters method doesn't check for them } + + /** + * Handle report of duplicate info field values + * @param key + * @param infoLine + */ + public void reportDuplicateInfoKeyValue(final String key, final String infoLine) {} + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java new file mode 100644 index 0000000000..54904f6ac2 --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java @@ -0,0 +1,75 @@ +package htsjdk.variant.vcf; + +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; + +import java.util.*; + +//TODO: Should we validate these alt allele types ? +// Structural Variants +// In symbolic alternate alleles for imprecise structural variants, the ID field indicates the type of structural variant, +// and can be a colon-separated list of types and subtypes. ID values are case sensitive strings and must not contain +// whitespace or angle brackets. 
The first level type must be one of the following: +// DEL Deletion relative to the reference +// INS Insertion of novel sequence relative to the reference +// DUP Region of elevated copy number relative to the reference +// INV Inversion of reference sequence +// CNV Copy number variable region (may be both deletion and duplication) +// The CNV category should not be used when a more specific category can be applied. Reserved subtypes include: +// DUP:TANDEM Tandem duplication +// DEL:ME Deletion of mobile element relative to the reference +// INS:ME Insertion of a mobile element relative to the reference +// IUPAC ambiguity codes +// Symbolic alleles can be used also to represent genuinely ambiguous data in VCF, for example: +// ##ALT= +// ##ALT= + +/** + * A class representing ALT fields in the VCF header + */ +public class VCFAltHeaderLine extends VCFStructuredHeaderLine { + private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFHeader.class); + + private static List expectedTags = Collections.unmodifiableList( + new ArrayList(2) {{ + add(ID_ATTRIBUTE); + add(DESCRIPTION_ATTRIBUTE); + }} + ); + + public VCFAltHeaderLine(final String line, final VCFHeaderVersion version) { + // We need to call the V4 parser directly since the V3 parser requires expected tags; validateForVersion + // will detect the version incompatibility if we're called on behalf of V3 + super(VCFConstants.ALT_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTags)); + validateForVersion(version); + } + + public VCFAltHeaderLine(final String id, final String description) { + super(VCFConstants.ALT_HEADER_KEY, + new LinkedHashMap() {{ + put(ID_ATTRIBUTE, id); + put(DESCRIPTION_ATTRIBUTE, description); + }} + ); + } + + /** + * Validate that this header line conforms to the target version. 
+ */ + @Override + public void validateForVersion(final VCFHeaderVersion vcfTargetVersion) { + super.validateForVersion(vcfTargetVersion); + //TODO: NOTE: should we have this V4.0 threshold ? + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0)) { + final String message = String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion.toString()); + if (VCFUtils.getStrictVCFVersionValidation()) { + throw new TribbleException.InvalidHeader(message); + } else { + logger.warn(message); + } + } + } +} diff --git a/src/main/java/htsjdk/variant/vcf/VCFCodec.java b/src/main/java/htsjdk/variant/vcf/VCFCodec.java index 6e5d3b7d2e..0b98316a2c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCodec.java @@ -1,6 +1,6 @@ /* * Copyright (c) 2012 The Broad Institute -* +* * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without @@ -9,10 +9,10 @@ * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: -* +* * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. -* +* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND @@ -25,17 +25,10 @@ package htsjdk.variant.vcf; -import htsjdk.tribble.TribbleException; -import htsjdk.tribble.readers.LineIterator; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.LinkedList; -import java.util.List; +import java.util.*; /** - * A feature codec for the VCF 4 specification + * A feature codec for the VCF 4.0, 4.1 and 4.2 specification versions * *

* VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a @@ -45,7 +38,7 @@ * of related samples. Recently the format for storing next-generation read alignments has been * standardised by the SAM/BAM file format specification. This has significantly improved the * interoperability of next-generation tools for alignment, visualisation, and variant calling. - * We propose the Variant Call Format (VCF) as a standarised format for storing the most prevalent + * We propose the Variant Call Format (VCF) as a standardised format for storing the most prevalent * types of sequence variation, including SNPs, indels and larger structural variants, together * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for * fast data retrieval of variants from a range of positions on the reference genome. @@ -72,91 +65,68 @@ * @since 2010 */ public class VCFCodec extends AbstractVCFCodec { - // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying + // on VariantContext to do the validation of any contradictory (or malformed) record parameters. public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; + public VCFCodec() { + // TODO: This defaults to "Unknown" and winds up in every VariantContext. Setting it + // here breaks some GATK4 tests. Should we put useful something here ? + //setName(String.format("%s:%s:%s", + // VCFHeaderVersion.VCF4_0.getVersionString(), + // VCFHeaderVersion.VCF4_1.getVersionString(), + // VCFHeaderVersion.VCF4_2.getVersionString()) + //); + } + /** - * Reads all of the header from the provided iterator, but no reads no further. 
- * @param lineIterator the line reader to take header lines from - * @return The parsed header + * Return true if this codec can handle the target version + * @param targetHeaderVersion + * @return true if this codec can handle this version */ @Override - public Object readActualHeader(final LineIterator lineIterator) { - final List headerStrings = new ArrayList(); - - String line; - boolean foundHeaderVersion = false; - while (lineIterator.hasNext()) { - line = lineIterator.peek(); - lineNo++; - if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { - final String[] lineFields = line.substring(2).split("="); - if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { - if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - foundHeaderVersion = true; - version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( ! version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]); - if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]); - } - headerStrings.add(lineIterator.next()); - } - else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { - if (!foundHeaderVersion) { - throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); - } - headerStrings.add(lineIterator.next()); - super.parseHeaderFromLines(headerStrings, version); - return this.header; - } - else { - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } + public boolean canDecodeVersion(final VCFHeaderVersion targetHeaderVersion) { + return targetHeaderVersion == 
VCFHeaderVersion.VCF4_0 || + targetHeaderVersion == VCFHeaderVersion.VCF4_1 || + targetHeaderVersion == VCFHeaderVersion.VCF4_2; + } - } - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); + @Override + public boolean canDecode(final String potentialInput) { + // TODO: this will succeed on 4.3 files since it only looks as far as ..."##fileformat=VCFv4" + return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); } /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * - * @param filterString the string to parse - * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF) + * Handle reporting of duplicate filter IDs + * @param duplicateFilterMessage */ @Override - protected List parseFilters(final String filterString) { - // null for unfiltered - if ( filterString.equals(VCFConstants.UNFILTERED) ) - return null; - - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) - return Collections.emptyList(); - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); - if (filterString.isEmpty()) - generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); - - // do we have the filter string cached? 
- if ( filterHash.containsKey(filterString) ) - return filterHash.get(filterString); - - // empty set for passes filters - final List fFields = new LinkedList(); - // otherwise we have to parse and cache the value - if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) - fFields.add(filterString); - else - fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); + protected void reportDuplicateFilterIDs(final String duplicateFilterMessage) { + // older versions of htsjdk have been silently dropping these for a while, but we can at least warn + if (VCFUtils.getVerboseVCFLogging()) { + logger.warn(duplicateFilterMessage); + } + } - filterHash.put(filterString, Collections.unmodifiableList(fFields)); + public void reportDuplicateInfoKeyValue(final String key, final String infoLine) {} - return fFields; + /** + * parse out the info fields + * @param infoField the fields + * @return a mapping of keys to objects + */ + protected Map parseInfo(String infoField) { + if (infoField.indexOf(' ') != -1) { + generateException( + String.format("Whitespace is not allowed in the INFO field in VCF version %s: %s", + version == null ? 
+ "unknown" : + version.getVersionString(), + infoField) + ); + } + return super.parseInfo(infoField); } - @Override - public boolean canDecode(final String potentialInput) { - return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); - } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java index 4d8c3447fc..1395f52e80 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java @@ -25,50 +25,206 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; -import htsjdk.variant.utils.GeneralUtils; +import htsjdk.utils.Utils; import htsjdk.variant.variantcontext.GenotypeLikelihoods; import htsjdk.variant.variantcontext.VariantContext; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.Map; +import java.util.*; +import java.util.function.BiFunction; +import java.util.regex.Pattern; + /** - * a base class for compound header lines, which include info lines and format lines (so far) + * Abstract base class for compound header lines, which include INFO lines and FORMAT lines. + * + * Compound header lines are distinguished only in that are required to have TYPE and NUMBER attributes + * (VCFHeaderLineCount, a VCFHeaderLineType, and a count). 
*/ -public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine { +public abstract class VCFCompoundHeaderLine extends VCFStructuredHeaderLine { + private static final long serialVersionUID = 1L; + protected static final Log logger = Log.getInstance(VCFCompoundHeaderLine.class); - public enum SupportedHeaderLineType { - INFO(true), FORMAT(false); + // regex pattern corresponding to legal info/format field keys + protected static final Pattern VALID_HEADER_ID_PATTERN = Pattern.compile("^[A-Za-z_][0-9A-Za-z_.]*$"); + protected static final String UNBOUND_DESCRIPTION = "Not provided in original VCF header"; - public final boolean allowFlagValues; - SupportedHeaderLineType(boolean flagValues) { - allowFlagValues = flagValues; - } + protected static String NUMBER_ATTRIBUTE = "Number"; + protected static String TYPE_ATTRIBUTE = "Type"; + + // List of expected tags that have a predefined order (used by the parser to verify order only). The + // header line class itself should verify that all required tags are present. 
+ protected static final List expectedTagOrder = Collections.unmodifiableList( + new ArrayList(4) {{ + add(ID_ATTRIBUTE); + add(NUMBER_ATTRIBUTE); + add(TYPE_ATTRIBUTE); + add(DESCRIPTION_ATTRIBUTE); + }} + ); + + // immutable, cached binary representations of compound header line attributes + private final VCFHeaderLineType type; + private final VCFHeaderLineCount countType; + private final int count; + + /** + * create a VCF compound header line with count type = VCFHeaderLineCount.INTEGER + * + * @param key the key (header line type) for this header line + * @param headerLineID the is or this header line + * @param count the count for this header line, sets countType type as VCFHeaderLineCount.INTEGER + * @param type the type for this header line + * @param description the description for this header line + */ + protected VCFCompoundHeaderLine( + final String key, + final String headerLineID, + final int count, + final VCFHeaderLineType type, + final String description) + { + this(key, createAttributeMap(headerLineID, VCFHeaderLineCount.INTEGER, count, type, description), VCFHeader.DEFAULT_VCF_VERSION); + } + + /** + * create a VCF compound header line + * + * @param key the key (header line type) for this header line + * @param headerLineID the id for this header line + * @param countType the count type for this header line + * @param type the type for this header line + * @param description the description for this header line + */ + protected VCFCompoundHeaderLine( + final String key, + final String headerLineID, + final VCFHeaderLineCount countType, + final VCFHeaderLineType type, + final String description) { + this(key, createAttributeMap(headerLineID, countType, VCFHeaderLineCount.VARIABLE_COUNT, type, description), VCFHeader.DEFAULT_VCF_VERSION); } - // the field types - private String name; - private int count = -1; - private VCFHeaderLineCount countType; - private String description; - private VCFHeaderLineType type; + /** + * create a VCF compound 
header line from an attribute map + * + * @param key the key (header line type) for this header line + * @param mapping the header line attribute map + * @param vcfVersion the VCF header version. This may be null, in which case + */ + protected VCFCompoundHeaderLine(final String key, final Map mapping, final VCFHeaderVersion vcfVersion) { + super(key, mapping); + Utils.nonNull(vcfVersion); + + this.type = decodeLineType(getGenericFieldValue(TYPE_ATTRIBUTE)); + final String countString = getGenericFieldValue(NUMBER_ATTRIBUTE); + this.countType = decodeCountType(countString, vcfVersion); + this.count = decodeCount(countString, this.countType); + validateForVersion(vcfVersion); + } + + /** + * Return the description for this header line. + * @return + */ + public String getDescription() { + final String description = getGenericFieldValue(DESCRIPTION_ATTRIBUTE); + return description == null ? + UNBOUND_DESCRIPTION : + description; + } - // access methods - @Override - public String getID() { return name; } - public String getDescription() { return description; } public VCFHeaderLineType getType() { return type; } + public VCFHeaderLineCount getCountType() { return countType; } - public boolean isFixedCount() { return countType == VCFHeaderLineCount.INTEGER; } + public boolean isFixedCount() { return countType.isFixedCount(); } + public int getCount() { - if (!isFixedCount()) - throw new TribbleException("Asking for header line count when type is not an integer"); + if (!isFixedCount()) { + throw new TribbleException("Header line count request when count type is not an integer"); + } return count; } + private VCFHeaderLineType decodeLineType(final String lineTypeString) { + if (lineTypeString == null) { + throw new TribbleException(String.format("A line type attribute is required for %s header lines", getKey())); + } else { + try { + return VCFHeaderLineType.valueOf(lineTypeString); + } catch (IllegalArgumentException e) { + throw new TribbleException(String.format( + 
"\"%s\" is not a valid type for %s header lines (note that types are case-sensitive)", + lineTypeString, + getKey())); + } + } + } + + private VCFHeaderLineCount decodeCountType(final String countString, final VCFHeaderVersion vcfVersion) { + if (countString == null) { + throw new TribbleException.InvalidHeader( + String.format("A count type/value must be provided for %s header lines.", getID())); + } + return VCFHeaderLineCount.decode(vcfVersion, countString); + } + + private int decodeCount(final String countString, final VCFHeaderLineCount requestedCountType) { + int lineCount = VCFHeaderLineCount.VARIABLE_COUNT; + if (requestedCountType.isFixedCount()) { + if (countString == null) { + throw new TribbleException.InvalidHeader(String.format("Missing count value in VCF header field %s", getID())); + } + try { + lineCount = Integer.valueOf(countString); + } catch (NumberFormatException e) { + throw new TribbleException.InvalidHeader(String.format("Invalid count value %s in VCF header field %s", lineCount, getID())); + } + if (lineCount < 0) { + throw new TribbleException.InvalidHeader("Count < 0 for fixed size VCF header field " + getID()); + } + if (getType() == VCFHeaderLineType.Flag && lineCount != 0) { + // This check is here on behalf of INFO lines (which are the only header line type allowed to have Flag + // type). A Flag type with a count value other than 0 violates the spec (at least v4.2 and v4.3), but + // to retain backward compatibility with previous implementations, we accept (and repair) and the line here. + updateGenericField(NUMBER_ATTRIBUTE, "0"); + lineCount = 0; + if (VCFUtils.getVerboseVCFLogging()) { + String message = String.format("FLAG fields must have a count value of 0, but saw count %d for header line %s. 
A value of 0 will be used", + lineCount, + getID()); + logger.warn(message); + } + } + } + return lineCount; + } + + /** + * Called when an attempt is made to add this header line to a header that is know to have a specific + * version, or when an attempt is made to change the version of header by changing it's target version, + * to validate that the header line conforms to the target version requirements. + */ + public void validateForVersion(final VCFHeaderVersion vcfTargetVersion) { + super.validateForVersion(vcfTargetVersion); + if (!VALID_HEADER_ID_PATTERN.matcher(getID()).matches() ) { + String message = String.format("ID value \"%s\" in \"%s\" header line does not conform to VCF %s ID restrictions", + getID(), + getKey(), + getKey()); + if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + if (VCFUtils.getStrictVCFVersionValidation()) { + throw new TribbleException.InvalidHeader(message); + } + if (VCFUtils.getVerboseVCFLogging()) { + // warn for older versions - this line can't be used as a v4.3 line + logger.warn(message); + } + } + } + } + /** * Get the number of values expected for this header field, given the properties of VariantContext vc * @@ -101,153 +257,79 @@ public int getCount(final VariantContext vc) { } } - public void setNumberToUnbounded() { - countType = VCFHeaderLineCount.UNBOUNDED; - count = -1; - } - - // our type of line, i.e. 
format, info, etc - private final SupportedHeaderLineType lineType; - - /** - * create a VCF format header line - * - * @param name the name for this header line - * @param count the count for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type - */ - protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - this.name = name; - this.countType = VCFHeaderLineCount.INTEGER; - this.count = count; - this.type = type; - this.description = description; - this.lineType = lineType; - validate(); - } - - /** - * create a VCF format header line - * - * @param name the name for this header line - * @param count the count type for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type - */ - protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - this.name = name; - this.countType = count; - this.type = type; - this.description = description; - this.lineType = lineType; - validate(); + // Create a backing attribute map out of VCFCompoundHeaderLine elements + private static Map createAttributeMap( + final String headerLineID, + final VCFHeaderLineCount countType, + final int count, + final VCFHeaderLineType type, + final String description) { + return new LinkedHashMap() { + { put(ID_ATTRIBUTE, headerLineID); } + { put(NUMBER_ATTRIBUTE, countType.encode(count)); } + { put(TYPE_ATTRIBUTE, type.encode()); } + { + // Handle the case where there's no description provided, ALLOW_UNBOUND_DESCRIPTIONS is the default + // note: if no description was provided, don't cache it, which means we don't round trip it + if (description 
!= null) { + put(DESCRIPTION_ATTRIBUTE, description); + } + } + }; } /** - * create a VCF format header line - * - * @param line the header line - * @param version the VCF header version - * @param lineType the header line type - * + * Compare two VCFCompoundHeaderLine (FORMAT or INFO) lines to determine if they have compatible number types, + * and return a VCFCompoundHeaderLine that can be used to represent the result of merging these lines. In the + * case where the merged line requires "promoting" one of the types to the other, a new line of the appropriate + * type is created by calling the {@code compoundHeaderLineResolver} to produce new line of the correct + * subclass (INFO or FORMAT). + * @param line1 + * @param line2 + * @param conflictWarner + * @param compoundHeaderLineResolver function that accepts two compound header lines of the same type (info or + * format, and returns a new header line representing the combination of the + * two input header lines + * @return */ - protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - - final ArrayList expectedTags = new ArrayList(Arrays.asList("ID", "Number", "Type", "Description")); - if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) - expectedTags.add("Version"); - final Map mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTags); - name = mapping.get("ID"); - count = -1; - final String numberStr = mapping.get("Number"); - if (numberStr.equals(VCFConstants.PER_ALTERNATE_COUNT)) { - countType = VCFHeaderLineCount.A; - } else if (numberStr.equals(VCFConstants.PER_ALLELE_COUNT)) { - countType = VCFHeaderLineCount.R; - } else if (numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT)) { - countType = VCFHeaderLineCount.G; - } else if ((version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || - (!version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && 
numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3))) { - countType = VCFHeaderLineCount.UNBOUNDED; - } else { - countType = VCFHeaderLineCount.INTEGER; - count = Integer.valueOf(numberStr); - - } - - if (count < 0 && countType == VCFHeaderLineCount.INTEGER) - throw new TribbleException.InvalidHeader("Count < 0 for fixed size VCF header field " + name); - - try { - type = VCFHeaderLineType.valueOf(mapping.get("Type")); - } catch (Exception e) { - throw new TribbleException(mapping.get("Type") + " is not a valid type in the VCF specification (note that types are case-sensitive)"); - } - if (type == VCFHeaderLineType.Flag && !allowFlagValues()) - throw new IllegalArgumentException("Flag is an unsupported type for this kind of field"); + public static VCFCompoundHeaderLine getSmartMergedCompoundHeaderLine( + final VCFCompoundHeaderLine line1, + final VCFCompoundHeaderLine line2, + final VCFHeader.HeaderConflictWarner conflictWarner, + BiFunction compoundHeaderLineResolver) + { + Utils. nonNull(line1); + Utils. 
nonNull(line2); - description = mapping.get("Description"); - if (description == null && ALLOW_UNBOUND_DESCRIPTIONS) // handle the case where there's no description provided - description = UNBOUND_DESCRIPTION; + VCFCompoundHeaderLine newLine = line1; - this.lineType = lineType; - - validate(); - } - - private void validate() { - if (name == null || type == null || description == null || lineType == null) - throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s", - super.getKey(), name, type, description, lineType)); - if (name.contains("<") || name.contains(">")) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain angle brackets"); - if (name.contains("=")) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain an equals sign"); - - if (type == VCFHeaderLineType.Flag && count != 0) { - count = 0; - if (GeneralUtils.DEBUG_MODE_ENABLED) { - System.err.println("FLAG fields must have a count value of 0, but saw " + count + " for header line " + getID() + ". Changing it to 0 inside the code"); + // Note: this can drop extra attributes + if (!line1.equalsExcludingExtraAttributes(line2)) { + if (line1.getType().equals(line2.getType())) { + // The lines are different in some way, but have a common type. + // The Number entry is an Integer that describes the number of values that can be + // included with the INFO field. For example, if the INFO field contains a single + // number, then this value should be 1. However, if the INFO field describes a pair + // of numbers, then this value should be 2 and so on. If the number of possible + // values varies, is unknown, or is unbounded, then this value should be '.'. + conflictWarner.warn(line1, "Promoting header field Number to . 
due to number differences in header lines: " + line1 + " " + line2); + newLine = compoundHeaderLineResolver.apply(line1, line2); + } else if (line1.getType() == VCFHeaderLineType.Integer && line2.getType() == VCFHeaderLineType.Float) { + // promote key to Float: line2 is the Float line, so it becomes the merged result + conflictWarner.warn(line1, "Promoting Integer to Float in header: " + line2); + newLine = line2; + } else if (line1.getType() == VCFHeaderLineType.Float && line2.getType() == VCFHeaderLineType.Integer) { + // promote key to Float: line1 is already the Float line, so newLine is left as line1 + conflictWarner.warn(line1, "Promoting Integer to Float in header: " + line2); + } else { + throw new IllegalStateException("Incompatible header types, collision between these two types: " + line1 + " " + line2); } } - } - - /** - * make a string representation of this header line - * @return a string representation - */ - @Override - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put("ID", name); - Object number; - switch (countType) { - case A: - number = VCFConstants.PER_ALTERNATE_COUNT; - break; - case R: - number = VCFConstants.PER_ALLELE_COUNT; - break; - case G: - number = VCFConstants.PER_GENOTYPE_COUNT; - break; - case UNBOUNDED: - number = VCFConstants.UNBOUNDED_ENCODING_v4; - break; - case INTEGER: - default: - number = count; + // A header line may have been created without a description, in which case getDescription() may + // return null, so the descriptions are compared null-safely rather than by calling equals() on one of them + if (!java.util.Objects.equals(line1.getDescription(), line2.getDescription())) { + conflictWarner.warn(line1, "Allowing unequal description fields through: keeping " + line2 + " excluding " + line1); } - map.put("Number", number); - map.put("Type", type); - map.put("Description", description); - return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map); + + return newLine; } /** @@ -260,44 +342,20 @@ public boolean equals(final Object o) { if ( this == o ) { return true; } - if ( o == null || getClass() != o.getClass() || !
super.equals(o) ) { + if ( o == null || getClass() != o.getClass() ) { return false; } - final VCFCompoundHeaderLine that = (VCFCompoundHeaderLine) o; - return equalsExcludingDescription(that) && - description.equals(that.description); - } - - @Override - public int hashCode() { - int result = super.hashCode(); - result = 31 * result + name.hashCode(); - result = 31 * result + count; - result = 31 * result + (countType != null ? countType.hashCode() : 0); // only nullable field according to validate() - result = 31 * result + description.hashCode(); - result = 31 * result + type.hashCode(); - result = 31 * result + lineType.hashCode(); - return result; + // let the attribute list determine equality + return super.equals(o); } - public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) { + private boolean equalsExcludingExtraAttributes(VCFCompoundHeaderLine other) { return count == other.count && countType == other.countType && type == other.type && - lineType == other.lineType && - name.equals(other.name); - } - - public boolean sameLineTypeAndName(VCFCompoundHeaderLine other) { - return lineType == other.lineType && - name.equals(other.name); + getKey().equals(other.getKey()) && + getID().equals(other.getID()); } - /** - * do we allow flag (boolean) values? (i.e. 
booleans where you don't have specify the value, AQ means AQ=true) - * @return true if we do, false otherwise - */ - abstract boolean allowFlagValues(); - } diff --git a/src/main/java/htsjdk/variant/vcf/VCFConstants.java b/src/main/java/htsjdk/variant/vcf/VCFConstants.java index 6a52d1df0e..11f12cf07c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFConstants.java +++ b/src/main/java/htsjdk/variant/vcf/VCFConstants.java @@ -45,7 +45,7 @@ public final class VCFConstants { public static final String GENOTYPE_KEY = "GT"; public static final String GENOTYPE_POSTERIORS_KEY = "GP"; public static final String GENOTYPE_QUALITY_KEY = "GQ"; - public static final String GENOTYPE_ALLELE_DEPTHS = "AD"; //AD isn't reserved, but is specifically handled by VariantContext + public static final String GENOTYPE_ALLELE_DEPTHS = "AD"; //AD is now reserved public static final String GENOTYPE_PL_KEY = "PL"; // phred-scaled genotype likelihoods public static final String EXPECTED_ALLELE_COUNT_KEY = "EC"; @Deprecated public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods @@ -86,12 +86,37 @@ public final class VCFConstants { public static final String PHASING_TOKENS = "/|\\"; // header lines - public static final String FILTER_HEADER_START = "##FILTER"; - public static final String FORMAT_HEADER_START = "##FORMAT"; - public static final String INFO_HEADER_START = "##INFO"; - public static final String ALT_HEADER_START = "##ALT"; + public static final String FILTER_HEADER_KEY = "FILTER"; + public static final String FILTER_HEADER_START = VCFHeader.METADATA_INDICATOR + FILTER_HEADER_KEY; + public static final int FILTER_HEADER_OFFSET = FILTER_HEADER_START.length() + 1; + + public static final String FORMAT_HEADER_KEY = "FORMAT"; + public static final String FORMAT_HEADER_START = VCFHeader.METADATA_INDICATOR + FORMAT_HEADER_KEY; + public static final int FORMAT_HEADER_OFFSET = FORMAT_HEADER_START.length() + 1; + + public static final String 
INFO_HEADER_KEY = "INFO"; + public static final String INFO_HEADER_START = VCFHeader.METADATA_INDICATOR + INFO_HEADER_KEY; + public static final int INFO_HEADER_OFFSET = INFO_HEADER_START.length() + 1; + + public static final String ALT_HEADER_KEY = "ALT"; + public static final String ALT_HEADER_START = VCFHeader.METADATA_INDICATOR + ALT_HEADER_KEY; + public static final int ALT_HEADER_OFFSET = ALT_HEADER_START.length() + 1; + + public static final String PEDIGREE_HEADER_KEY = "PEDIGREE"; + public static final String PEDIGREE_HEADER_START = VCFHeader.METADATA_INDICATOR + PEDIGREE_HEADER_KEY; + public static final int PEDIGREE_HEADER_OFFSET = PEDIGREE_HEADER_START.length() + 1; + + public static final String SAMPLE_HEADER_KEY = "SAMPLE"; + public static final String SAMPLE_HEADER_START = VCFHeader.METADATA_INDICATOR + SAMPLE_HEADER_KEY; + public static final int SAMPLE_HEADER_OFFSET = SAMPLE_HEADER_START.length() + 1; + + public static final String META_HEADER_KEY = "META"; + public static final String META_HEADER_START = VCFHeader.METADATA_INDICATOR + META_HEADER_KEY; + public static final int META_HEADER_OFFSET = META_HEADER_START.length() + 1; + public static final String CONTIG_HEADER_KEY = "contig"; - public static final String CONTIG_HEADER_START = "##" + CONTIG_HEADER_KEY; + public static final String CONTIG_HEADER_START = VCFHeader.METADATA_INDICATOR + CONTIG_HEADER_KEY; + public static final int CONTIG_HEADER_OFFSET = CONTIG_HEADER_START.length() + 1; // old indel alleles public static final char DELETION_ALLELE_v3 = 'D'; diff --git a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java index 12e400c95c..fb0bad9116 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java @@ -26,10 +26,11 @@ package htsjdk.variant.vcf; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.Log; import 
htsjdk.tribble.TribbleException; -import java.util.LinkedHashMap; -import java.util.Map; +import java.util.*; +import java.util.regex.Pattern; /** * A special class representing a contig VCF header line. Knows the true contig order and sorts on that @@ -38,52 +39,142 @@ * * @author mdepristo */ -public class VCFContigHeaderLine extends VCFSimpleHeaderLine { +public class VCFContigHeaderLine extends VCFStructuredHeaderLine { + private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFContigHeaderLine.class); + + final static Pattern VALID_CONTIG_ID_PATTERN = Pattern.compile("[!-)+-<>-~][!-~]*"); + final Integer contigIndex; + public static String LENGTH_ATTRIBUTE = "length"; + public static String ASSEMBLY_ATTRIBUTE = "assembly"; + public static String MD5_ATTRIBUTE = "md5"; + public static String URL_ATTRIBUTE = "URL"; + public static String SPECIES_ATTRIBUTE = "species"; + /** * create a VCF contig header line * * @param line the header line * @param version the vcf header version - * @param key the key for this header line + * @param contigIndex */ - public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final String key, final int contigIndex) { - super(line, version, key, null); - if (contigIndex < 0) throw new TribbleException("The contig index is less than zero."); - this.contigIndex = contigIndex; + public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final int contigIndex) { + this(VCFHeaderLineTranslator.parseLine( + version, line, Collections.singletonList(VCFStructuredHeaderLine.ID_ATTRIBUTE)), contigIndex); + validateForVersion(version); } public VCFContigHeaderLine(final Map mapping, final int contigIndex) { super(VCFHeader.CONTIG_KEY, mapping); - if (contigIndex < 0) throw new TribbleException("The contig index is less than zero."); + if (contigIndex < 0) { + throw new TribbleException("The contig index is less than zero."); + } this.contigIndex = 
contigIndex; } - VCFContigHeaderLine(final SAMSequenceRecord sequenceRecord, final String assembly) { - // Using LinkedHashMap to preserve order of keys in contig line (ID, length, assembly) - super(VCFHeader.CONTIG_KEY, new LinkedHashMap() {{ - // Now inside an init block in an anon HashMap subclass - this.put("ID", sequenceRecord.getSequenceName()); - this.put("length", Integer.toString(sequenceRecord.getSequenceLength())); - if ( assembly != null ) this.put("assembly", assembly); - }}); - this.contigIndex = sequenceRecord.getSequenceIndex(); + /** + * Return a VCFContigHeaderLine representing a SAMSequenceRecord. + * + * NOTE: round-tripping between VCFContigHeaderLines and SAMSequenceRecords can be lossy since they + * don't necessarily have equivalent attributes, i.e., SAMSequenceRecord can have a species attribute + * that isn't defined by the VCF spec. + * + * @return VCFContigHeaderLine for the SAMSequenceRecord + */ + public VCFContigHeaderLine(final SAMSequenceRecord sequenceRecord, final String assembly) { + // preserve order of keys in contig line (ID, length, assembly) + this(new LinkedHashMap() {{ + this.put(ID_ATTRIBUTE, sequenceRecord.getSequenceName()); + this.put(LENGTH_ATTRIBUTE, Integer.toString(sequenceRecord.getSequenceLength())); + if (assembly != null) { + if (!assembly.equals(sequenceRecord.getAssembly()) && VCFUtils.getVerboseVCFLogging()) { + logger.warn(String.format( + "Inconsistent \"assembly\" attribute values found while creating VCFContigLine " + + "(with assembly \"%s\") from SAMSequenceRecord (with assembly \"%s\")", + assembly, + sequenceRecord.getAssembly())); + } + this.put(ASSEMBLY_ATTRIBUTE, assembly); + } + if (sequenceRecord.getMd5() != null) { + this.put(MD5_ATTRIBUTE, sequenceRecord.getMd5()); + } + if (sequenceRecord.getAttribute(SAMSequenceRecord.URI_TAG) != null) { + this.put(URL_ATTRIBUTE, sequenceRecord.getAttribute(SAMSequenceRecord.URI_TAG)); + } + if (sequenceRecord.getAttribute(SAMSequenceRecord.SPECIES_TAG) != 
null) { + this.put(SPECIES_ATTRIBUTE, sequenceRecord.getAttribute(SAMSequenceRecord.SPECIES_TAG)); + } + }}, + sequenceRecord.getSequenceIndex() + ); } + /** + * Return a SAMSequenceRecord representing this contig line. + * + * NOTE: roundtripping between VCFContigHeaderLines and SAMSequenceRecords can be lossy since they + * don't necessarily have equivalent attributes, i.e., SAMSequenceRecord can have a species attribute + * that isn't defined by the VCF spec. + * + * @return SAMSequenceRecord for this contig line + */ + public SAMSequenceRecord getSAMSequenceRecord() { + final String lengthString = this.getGenericFieldValue(LENGTH_ATTRIBUTE); + if (lengthString == null) { + throw new TribbleException("Contig " + this.getID() + " does not have a length field."); + } + final SAMSequenceRecord record = new SAMSequenceRecord(this.getID(), Integer.valueOf(lengthString)); + final String assemblyString = this.getGenericFieldValue(ASSEMBLY_ATTRIBUTE); + if (assemblyString != null) { + record.setAssembly(assemblyString); + } + record.setSequenceIndex(this.contigIndex); + final String md5 = getGenericFieldValue(MD5_ATTRIBUTE); + if (md5 != null) { + record.setMd5(md5); + } + final String url = getGenericFieldValue(URL_ATTRIBUTE); + if (url != null) { + record.setAttribute(SAMSequenceRecord.URI_TAG, url); + } + final String species = getGenericFieldValue(SPECIES_ATTRIBUTE); + if (species != null) { + record.setSpecies(species); + } + return record; + } + + /** + * Validate that this header line conforms to the target version. 
+ */ + @Override + public void validateForVersion(final VCFHeaderVersion vcfTargetVersion) { + super.validateForVersion(vcfTargetVersion); + if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + //TODO: V4.3 The contig names must not use a reserved symbolic allele/exclude the characters + // l/r chevron, l/r bracket, colon, asterisk + // reject the line only when the ID does NOT match the pattern describing valid contig IDs + if (!VALID_CONTIG_ID_PATTERN.matcher(getID()).matches()) { + String message = String.format("Contig headerLineID \"%s\" in \"%s\" header line doesn't conform to VCF contig ID restrictions" , + getID(), + getKey()); + throw new TribbleException.InvalidHeader(message); + } + } + } + public Integer getContigIndex() { return contigIndex; } - public SAMSequenceRecord getSAMSequenceRecord() { - final String lengthString = this.getGenericFieldValue("length"); - if (lengthString == null) throw new TribbleException("Contig " + this.getID() + " does not have a length field."); - final SAMSequenceRecord record = new SAMSequenceRecord(this.getID(), Integer.valueOf(lengthString)); - record.setAssembly(this.getGenericFieldValue("assembly")); - record.setSequenceIndex(this.contigIndex); - return record; - } - + /** + * Note: this class has a natural ordering that is inconsistent with equals() + * + * @param o + * @return + */ @Override public boolean equals(final Object o) { if ( this == o ) { @@ -106,6 +197,8 @@ public int hashCode() { /** * IT IS CRITICAL THAT THIS BE OVERRIDDEN SO WE SORT THE CONTIGS IN THE CORRECT ORDER + * + * Note: this class has a natural ordering that is inconsistent with equals() */ @Override public int compareTo(final Object other) { diff --git a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java index 5130963acf..be4422a399 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java @@ -25,25 +25,40 @@ package htsjdk.variant.vcf; -import java.util.Arrays; +import
htsjdk.tribble.TribbleException; + +import java.util.*; /** * @author ebanks * - * A class representing a key=value entry for FILTER fields in the VCF header + * A class representing FILTER fields in the VCF header */ -public class VCFFilterHeaderLine extends VCFSimpleHeaderLine { - +public class VCFFilterHeaderLine extends VCFStructuredHeaderLine { + private static final long serialVersionUID = 1L; + private static List requiredTagOrder = Collections.unmodifiableList( + new ArrayList(2) {{ + add(ID_ATTRIBUTE); + add(VCFStructuredHeaderLine.DESCRIPTION_ATTRIBUTE); + }} + ); + /** * create a VCF filter header line * - * @param name the name for this header line + * @param id the headerLineID for this header line * @param description the description for this header line */ - public VCFFilterHeaderLine(final String name, final String description) { - super("FILTER", name, description); + public VCFFilterHeaderLine(final String id, final String description) { + super(VCFConstants.FILTER_HEADER_KEY, + new LinkedHashMap(2) {{ + put(ID_ATTRIBUTE, id); + put(DESCRIPTION_ATTRIBUTE, description); + }} + ); + validate(); } /** @@ -51,29 +66,37 @@ public VCFFilterHeaderLine(final String name, final String description) { * @param name */ public VCFFilterHeaderLine(final String name) { - super("FILTER", name, name); + this(name, name); } /** - * create a VCF info header line + * create a VCF filter header line * * @param line the header line * @param version the vcf header version */ public VCFFilterHeaderLine(final String line, final VCFHeaderVersion version) { - super(line, version, "FILTER", Arrays.asList("ID", "Description")); + super(VCFConstants.FILTER_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, requiredTagOrder)); + validate(); + validateForVersion(version); + } + + private void validate() { + if (getDescription() == null) { + throw new TribbleException.InvalidHeader("Missing Description attribute in filter header line"); + } } @Override public boolean 
shouldBeAddedToDictionary() { return true; } - + /** * get the "Description" field * @return the "Description" field */ public String getDescription() { - return getGenericFieldValue("Description"); + return getGenericFieldValue(DESCRIPTION_ATTRIBUTE); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java index 74f4d5e5e3..dc05a36162 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java @@ -26,34 +26,43 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; + /** * @author ebanks *

* Class VCFFormatHeaderLine *

*

- * A class representing a key=value entry for genotype FORMAT fields in the VCF header

+ * A class representing genotype FORMAT fields in the VCF header

*/ public class VCFFormatHeaderLine extends VCFCompoundHeaderLine { + private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFFormatHeaderLine.class); public VCFFormatHeaderLine(String name, int count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.FORMAT); - if (type == VCFHeaderLineType.Flag) - throw new IllegalArgumentException("Flag is an unsupported type for format fields"); + super(VCFConstants.FORMAT_HEADER_KEY, name, count, type, description); + validate(); } public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.FORMAT); + super(VCFConstants.FORMAT_HEADER_KEY, name, count, type, description); + validate(); } public VCFFormatHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.FORMAT); + super(VCFConstants.FORMAT_HEADER_KEY, + VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder), + version); + validate(); + validateForVersion(version); } - // format fields do not allow flag values (that wouldn't make much sense, how would you encode this in the genotype). 
- @Override - boolean allowFlagValues() { - return false; + private void validate() { + if (this.getType() == VCFHeaderLineType.Flag) { + throw new TribbleException("Flag is an unsupported type for format fields"); + } } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeader.java b/src/main/java/htsjdk/variant/vcf/VCFHeader.java index 30dce37e65..dd409a593a 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeader.java @@ -26,52 +26,46 @@ package htsjdk.variant.vcf; import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; import htsjdk.tribble.util.ParsingUtils; +import htsjdk.utils.Utils; import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.VariantContextComparator; import java.io.Serializable; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - +import java.util.*; +import java.util.stream.Collectors; /** - * A class to represent a VCF header + * A class to represent a VCF header. + * + * VCFHeaders maintain a VCFHeaderVersion that is established via the following precedence: + * + * - derived from a ##fileformat line embedded in the metadata lines list + * - supplied in a constructor + * - the default header version, currently vcfv42 * - * @author aaron - * NOTE: This class stores header lines in lots of places. 
The original author noted that this should - * be cleaned up at some point in the future (jgentry - 5/2013) + * Any attempt to add metadata lines, or change the header version via {@link #setHeaderVersion} will + * triggera validation pass against the metadata lines to ensure they conform to the rules defined by + * the VCF specification for that version. */ public class VCFHeader implements Serializable { public static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFHeader.class); // the mandatory header fields public enum HEADER_FIELDS { CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO } - // the associated meta data - private final Set mMetaData = new LinkedHashSet(); - private final Map mInfoMetaData = new LinkedHashMap(); - private final Map mFormatMetaData = new LinkedHashMap(); - private final Map mFilterMetaData = new LinkedHashMap(); - private final Map mOtherMetaData = new LinkedHashMap(); - private final List contigMetaData = new ArrayList(); + // TODO: Should we reject attempts to add two contig header lines with the same contigIndex ? + // TODO: GATK VcfUtilsUnitTest.createHeaderLines test creates headers with contig lines with identical (0) indices + // The associated meta data + private final VCFMetaDataLines mMetaData = new VCFMetaDataLines(); - // the list of auxillary tags - private final List mGenotypeSampleNames = new ArrayList(); + // the list of auxiliary tags + private final List mGenotypeSampleNames = new ArrayList<>(); // the character string that indicates meta data public static final String METADATA_INDICATOR = "##"; @@ -98,47 +92,91 @@ public enum HEADER_FIELDS { private boolean writeEngineHeaders = true; private boolean writeCommandLine = true; + // VCFVersion for this header. Default to VCFv4.2. 
+ public final static VCFHeaderVersion DEFAULT_VCF_VERSION = VCFHeaderVersion.VCF4_2; + private VCFHeaderVersion vcfVersion; + /** * Create an empty VCF header with no header lines and no samples */ - public VCFHeader() { - this(Collections.emptySet(), Collections.emptySet()); - } + public VCFHeader() { this.vcfVersion = DEFAULT_VCF_VERSION; } /** - * create a VCF header, given a list of meta data and auxiliary tags + * Create a VCF header, given a list of meta data and auxiliary tags * - * @param metaData the meta data associated with this header + * @param metaData the meta data associated with this header */ public VCFHeader(final Set metaData) { - mMetaData.addAll(metaData); - removeVCFVersionLines(mMetaData); - createLookupEntriesForAllHeaderLines(); - checkForDeprecatedGenotypeLikelihoodsKey(); + this(null, metaData, new HashSet<>()); } /** - * Creates a deep copy of the given VCFHeader, duplicating all its metadata and + * Creates a copy of the given VCFHeader, duplicating all it's metadata and * sample names. 
*/ public VCFHeader(final VCFHeader toCopy) { - this(toCopy.mMetaData, toCopy.mGenotypeSampleNames); + // TODO: this constructor doesn't propagate all existing state (writeEngineHeaders, etc) + this(toCopy.getHeaderVersion(), toCopy.getMetaDataInInputOrder(), toCopy.mGenotypeSampleNames); } /** - * create a VCF header, given a list of meta data and auxillary tags + * Create a VCF header, given a set of meta data and auxiliary tags * - * @param metaData the meta data associated with this header + * @param metaData set of meta data associated with this header * @param genotypeSampleNames the sample names */ public VCFHeader(final Set metaData, final Set genotypeSampleNames) { - this(metaData, new ArrayList(genotypeSampleNames)); + this(null, metaData, new ArrayList<>(genotypeSampleNames)); } public VCFHeader(final Set metaData, final List genotypeSampleNames) { - this(metaData); + this(null, metaData, genotypeSampleNames); + } - if ( genotypeSampleNames.size() != new HashSet(genotypeSampleNames).size() ) + /** + * create a VCF header, given a set of meta data and auxiliary tags + * + * @param vcfHeaderVersion vcfHeader version (against which the header lines will be validated) + * @param metaData set of meta data associated with this header + * @param genotypeSampleNames the sample names + */ + public VCFHeader( + final VCFHeaderVersion vcfHeaderVersion, + final Set metaData, + final Set genotypeSampleNames) { + this(vcfHeaderVersion, metaData, new ArrayList<>(genotypeSampleNames)); + } + + /** + * Create a versioned VCF header. + * + * @param vcfHeaderVersion requested header version. The header version for this header. Can be null, in which + * case the header version is determined by an embedded ##fileformat metadata line, if + * any, or the default vcf version. If non null, the version defined by any embedded + * ##fileformat lines has precedence. + * @param metaData The metada lines for this header. The lines must be valid for the version for this header. 
+ * @param genotypeSampleNames + */ + public VCFHeader( + final VCFHeaderVersion vcfHeaderVersion, + final Set metaData, + final List genotypeSampleNames) + { + Utils.nonNull(metaData); + Utils.nonNull(genotypeSampleNames); + + // Establish the version for this header using the following precedence: + // 1) the version defined by any ##fileformat metadata line in the metadata list + // 2) the requested version argument, if any (warn if this conflicts with the embedded fileformat) + // 3) the default VCFHeaderVersion + this.vcfVersion = establishHeaderVersion(vcfHeaderVersion, metaData); + + mMetaData.addAllMetaDataLines(metaData); + mMetaData.validateMetaDataLines(this.vcfVersion); + + checkForDeprecatedGenotypeLikelihoodsKey(); + + if ( genotypeSampleNames.size() != new HashSet<>(genotypeSampleNames).size() ) throw new TribbleException.InvalidHeader("BUG: VCF header has duplicate sample names"); mGenotypeSampleNames.addAll(genotypeSampleNames); @@ -146,87 +184,169 @@ public VCFHeader(final Set metaData, final List genotypeS buildVCFReaderMaps(genotypeSampleNames); } + + /** + * Establish the header version using the following precedence: + * 1) the version defined by any ##fileformat metadata line in the metadata list + * 2) the requested version argument, if any (warn if this conflicts with the embedded fileformat) + * 3) the default VCFHeaderVersion + * @param requestedVCFHeaderVersion + * @param metaData + * @return vcfHeaderVersion to be used for the header + */ + private VCFHeaderVersion establishHeaderVersion( + final VCFHeaderVersion requestedVCFHeaderVersion, + final Set metaData) + { + VCFHeaderLine embeddedVersionLine = getVersionLineFromHeaderLineSet(metaData); + if (embeddedVersionLine == null) { + return requestedVCFHeaderVersion == null ? 
+ DEFAULT_VCF_VERSION : + requestedVCFHeaderVersion; // use the requested version + } else { // embeddedVersionLine not null, reconcile with requested version + VCFHeaderVersion embeddedHeaderVersion = VCFHeaderVersion.toHeaderVersion(embeddedVersionLine.getValue()); + if (requestedVCFHeaderVersion != null && + !requestedVCFHeaderVersion.equals(embeddedHeaderVersion)) { + final String message = String.format("VCFHeader metadata version (%s) is inconsistent with requested version (%s). " + + "Falling back to metadata version.", + embeddedHeaderVersion, + requestedVCFHeaderVersion); + if (VCFUtils.getStrictVCFVersionValidation()) { + throw new IllegalArgumentException(message); + } + if (VCFUtils.getVerboseVCFLogging()) { + logger.warn(message); + } + } + return embeddedHeaderVersion; + } + } + + /** + * Get the header version for this header. + * @return the VCFHeaderVersion for this header. + */ + public VCFHeaderVersion getHeaderVersion() { + return vcfVersion; + } + + /** + * Set the version header for this class. + * + * Validates all metadata line to ensure they conform to the target header version (i.e, if the metadata lines + * contain a ##fileformat line that specifies a version that is different than the {@code newVCFVersion}. + * + * @param newVCFVersion + */ + public void setHeaderVersion(final VCFHeaderVersion newVCFVersion) { + Utils.nonNull(newVCFVersion, "A non-null VCFHeaderVersion must be provided"); + if (!newVCFVersion.equals(vcfVersion)) { + logger.warn(String.format("Changing VCFHeader version from %s to %s", + vcfVersion.getVersionString(), + newVCFVersion.getVersionString())); + // TODO: This can cause failures in code that used to succeed (i.e. Picard LiftOverVcf tests fail + // if they're not modified to remove the embedded version line) since we now retain ##fileformat + // lines in the metadata list; the validation code recognizes and validates the embedded ##fileformat + // lines against the new version, and throws if they conflict. 
+ // + // We might want to add a removeHeaderLine method to VCFHeader so that consumers with this problem + // such as LiftOverVcf can first manually remove the embedded fileformat line (currently you'd have + // to create a new header to achieve that). + mMetaData.validateMetaDataLines(newVCFVersion); + } + } + + /** + * Adds a new line to the VCFHeader. If there is an existing header line of the + * same type with the same key, and the header version is pre-4.3, the new line is added + * using a modified key to make it unique for BWC; otherwise (in strict validation mode), an + * exception is thrown since duplicates are not allowed. + * + * @param headerLine header line to attempt to add + */ + public void addMetaDataLine(final VCFHeaderLine headerLine) { + mMetaData.validateMetaDataLine(vcfVersion, headerLine); + mMetaData.addMetaDataLine(headerLine); + checkForDeprecatedGenotypeLikelihoodsKey(); + } + /** * Tell this VCF header to use pre-calculated sample name ordering and the * sample name -> offset map. 
This assumes that all VariantContext created * using this header (i.e., read by the VCFCodec) will have genotypes * occurring in the same order * - * @param genotypeSampleNamesInAppearenceOrder genotype sample names, must iterator in order of appearance + * @param genotypeSampleNamesInAppearanceOrder genotype sample names, must iterator in order of appearance */ - private void buildVCFReaderMaps(final Collection genotypeSampleNamesInAppearenceOrder) { - sampleNamesInOrder = new ArrayList(genotypeSampleNamesInAppearenceOrder.size()); - sampleNameToOffset = new HashMap(genotypeSampleNamesInAppearenceOrder.size()); + private void buildVCFReaderMaps(final Collection genotypeSampleNamesInAppearanceOrder) { + sampleNamesInOrder = new ArrayList<>(genotypeSampleNamesInAppearanceOrder.size()); + sampleNameToOffset = new HashMap<>(genotypeSampleNamesInAppearanceOrder.size()); int i = 0; - for (final String name : genotypeSampleNamesInAppearenceOrder) { + for (final String name : genotypeSampleNamesInAppearanceOrder) { sampleNamesInOrder.add(name); sampleNameToOffset.put(name, i++); } Collections.sort(sampleNamesInOrder); } - /** - * Adds a new line to the VCFHeader. If there is an existing header line of the - * same type with the same key, the new line is not added and the existing line - * is preserved. + * Find and return the VCF fileformat/version line * - * @param headerLine header line to attempt to add - */ - public void addMetaDataLine(final VCFHeaderLine headerLine) { - // Try to create a lookup entry for the new line. If this succeeds (because there was - // no line of this type with the same key), add the line to our master list of header - // lines in mMetaData. 
- if ( addMetadataLineLookupEntry(headerLine) ) { - mMetaData.add(headerLine); - checkForDeprecatedGenotypeLikelihoodsKey(); + * Return null if no fileformat/version lines are found + */ + protected static VCFHeaderLine getVersionLineFromHeaderLineSet(final Set metaDataLines) { + VCFHeaderLine versionLine = null; + final List formatLines = new ArrayList<>(); + for (final VCFHeaderLine headerLine : metaDataLines) { + if (VCFHeaderVersion.isFormatString(headerLine.getKey())) { + formatLines.add(headerLine); + } + } + + if (!formatLines.isEmpty()) { + if (formatLines.size() > 1) { + //TODO: should this throw, or log, or remove all but one (if the duplicates are consistent) ? + throw new TribbleException("Multiple version header lines found in header line list"); + } + return formatLines.get(0); } + + return versionLine; } /** * @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present */ public List getContigLines() { - return Collections.unmodifiableList(contigMetaData); - } + return mMetaData.getContigLines(); + } /** - * Returns the contigs in this VCF file as a SAMSequenceDictionary. Returns null if contigs lines are - * not present in the header. Throws SAMException if one or more contig lines do not have length + * Returns the contigs in this VCF Header as a SAMSequenceDictionary. + * + * @return Returns null if contig lines are not present in the header. + * @throws TribbleException if one or more contig lines do not have length * information. */ public SAMSequenceDictionary getSequenceDictionary() { final List contigHeaderLines = this.getContigLines(); - if (contigHeaderLines.isEmpty()) return null; - - final List sequenceRecords = new ArrayList(contigHeaderLines.size()); - for (final VCFContigHeaderLine contigHeaderLine : contigHeaderLines) { - sequenceRecords.add(contigHeaderLine.getSAMSequenceRecord()); - } - - return new SAMSequenceDictionary(sequenceRecords); + return contigHeaderLines.isEmpty() ? 
null : + new SAMSequenceDictionary( + contigHeaderLines.stream() + .map(contigLine -> contigLine.getSAMSequenceRecord()) + .collect(Collectors.toCollection(ArrayList::new)) + ); } /** - * Completely replaces the contig records in this header with those in the given SAMSequenceDictionary. + * Completely replaces all contig header lines in this header with ones derived from the given SAMSequenceDictionary. + * + * @param dictionary SAMSequenceDictionary to use to create VCFContigHeaderLines for this header */ public void setSequenceDictionary(final SAMSequenceDictionary dictionary) { - this.contigMetaData.clear(); - - // Also need to remove contig record lines from mMetaData - final List toRemove = new ArrayList(); - for (final VCFHeaderLine line : mMetaData) { - if (line instanceof VCFContigHeaderLine) { - toRemove.add(line); - } - } - mMetaData.removeAll(toRemove); - for (final SAMSequenceRecord record : dictionary.getSequences()) { - contigMetaData.add(new VCFContigHeaderLine(record, record.getAssembly())); - } - - this.mMetaData.addAll(contigMetaData); + getContigLines().forEach(hl -> mMetaData.removeHeaderLine(hl)); + dictionary.getSequences().forEach(r -> addMetaDataLine(new VCFContigHeaderLine(r, r.getAssembly()))); } public VariantContextComparator getVCFRecordComparator() { @@ -236,131 +356,12 @@ public VariantContextComparator getVCFRecordComparator() { /** * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present */ - public List getFilterLines() { - final List filters = new ArrayList(); - for (final VCFHeaderLine line : mMetaData) { - if ( line instanceof VCFFilterHeaderLine ) { - filters.add((VCFFilterHeaderLine)line); - } - } - return filters; - } + public List getFilterLines() { return mMetaData.getFilterLines(); } /** - * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present + * @return all of the VCF ID lines in their original file order, or an empty 
list if none were present */ - public List getIDHeaderLines() { - final List filters = new ArrayList(); - for (final VCFHeaderLine line : mMetaData) { - if (line instanceof VCFIDHeaderLine) { - filters.add((VCFIDHeaderLine)line); - } - } - return filters; - } - - /** - * Remove all lines with a VCF version tag from the provided set of header lines - */ - private void removeVCFVersionLines( final Set headerLines ) { - final List toRemove = new ArrayList(); - for (final VCFHeaderLine line : headerLines) { - if (VCFHeaderVersion.isFormatString(line.getKey())) { - toRemove.add(line); - } - } - headerLines.removeAll(toRemove); - } - - /** - * Creates lookup table entries for all header lines in mMetaData. - */ - private void createLookupEntriesForAllHeaderLines() { - for (final VCFHeaderLine line : mMetaData) { - addMetadataLineLookupEntry(line); - } - } - - /** - * Add a single header line to the appropriate type-specific lookup table (but NOT to the master - * list of lines in mMetaData -- this must be done separately if desired). - * - * If a header line is present that has the same key as an existing line, it will not be added. A warning - * will be shown if this occurs when GeneralUtils.DEBUG_MODE_ENABLED is true, otherwise this will occur - * silently. 
- * - * @param line header line to attempt to add to its type-specific lookup table - * @return true if the line was added to the appropriate lookup table, false if there was an existing - * line with the same key and the new line was not added - */ - private boolean addMetadataLineLookupEntry(final VCFHeaderLine line) { - if ( line instanceof VCFInfoHeaderLine ) { - final VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line; - return addMetaDataLineMapLookupEntry(mInfoMetaData, infoLine.getID(), infoLine); - } else if ( line instanceof VCFFormatHeaderLine ) { - final VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; - return addMetaDataLineMapLookupEntry(mFormatMetaData, formatLine.getID(), formatLine); - } else if ( line instanceof VCFFilterHeaderLine ) { - final VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine)line; - return addMetaDataLineMapLookupEntry(mFilterMetaData, filterLine.getID(), filterLine); - } else if ( line instanceof VCFContigHeaderLine ) { - return addContigMetaDataLineLookupEntry((VCFContigHeaderLine) line); - } else { - return addMetaDataLineMapLookupEntry(mOtherMetaData, line.getKey(), line); - } - } - - /** - * Add a contig header line to the lookup list for contig lines (contigMetaData). If there's - * already a contig line with the same ID, does not add the line. - * - * Note: does not add the contig line to the master list of header lines in mMetaData -- - * this must be done separately if desired. 
- * - * @param line contig header line to add - * @return true if line was added to the list of contig lines, otherwise false - */ - private boolean addContigMetaDataLineLookupEntry(final VCFContigHeaderLine line) { - for (VCFContigHeaderLine vcfContigHeaderLine : contigMetaData) { - // if we are trying to add a contig for the same ID - if (vcfContigHeaderLine.getID().equals(line.getID())) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found duplicate VCF contig header lines for " + line.getID() + "; keeping the first only" ); - } - // do not add this contig if it exists - return false; - } - } - - contigMetaData.add(line); - return true; - } - - /** - * Add a header line to the provided map at a given key. If the key already exists, it will not be replaced. - * If it does already exist and GeneralUtils.DEBUG_MODE_ENABLED is true, it will issue warnings about duplicates, - * otherwise it will silently leave the existing key/line pair as is. - * - * Note: does not add the header line to the master list of header lines in mMetaData -- - * this must be done separately if desired. 
- * - * @param map a map from each key to the associated VCFHeaderLine - * @param key the key to insert this line at - * @param line the line to insert at this key - * @param a type of vcf header line that extends VCFHeaderLine - * @return true if the line was added to the map, false if it was not added because there's already a line with that key - */ - private boolean addMetaDataLineMapLookupEntry(final Map map, final String key, final T line) { - if ( map.containsKey(key) ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found duplicate VCF header lines for " + key + "; keeping the first only" ); - } - return false; - } - - map.put(key, line); - return true; - } + public List getStructuredHeaderLines() { return mMetaData.getStructuredHeaderLines(); } /** * Check for the presence of a format line with the deprecated key {@link VCFConstants#GENOTYPE_LIKELIHOODS_KEY}. @@ -369,12 +370,16 @@ private boolean addMetaDataLineMapLookupEntry(final Ma */ private void checkForDeprecatedGenotypeLikelihoodsKey() { if ( hasFormatLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && ! hasFormatLine(VCFConstants.GENOTYPE_PL_KEY) ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no " + if ( VCFUtils.getVerboseVCFLogging() ) { + logger.warn("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no " + VCFConstants.GENOTYPE_PL_KEY + " field. 
We now only manage PL fields internally" + " automatically adding a corresponding PL field to your VCF header"); } - addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); + addMetaDataLine(new VCFFormatHeaderLine( + VCFConstants.GENOTYPE_PL_KEY, + VCFHeaderLineCount.G, + VCFHeaderLineType.Integer, + "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); } } @@ -385,42 +390,33 @@ private void checkForDeprecatedGenotypeLikelihoodsKey() { * @return a set of the header fields, in order */ public Set getHeaderFields() { - return new LinkedHashSet(Arrays.asList(HEADER_FIELDS.values())); + return new LinkedHashSet<>(Arrays.asList(HEADER_FIELDS.values())); } /** - * get the meta data, associated with this header, in sorted order + * get the meta data, associated with this header, in input order * * @return a set of the meta data */ - public Set getMetaDataInInputOrder() { - return makeGetMetaDataSet(mMetaData); - } + public Set getMetaDataInInputOrder() { return mMetaData.getMetaDataInInputOrder(); } - public Set getMetaDataInSortedOrder() { - return makeGetMetaDataSet(new TreeSet(mMetaData)); - } - - private static Set makeGetMetaDataSet(final Set headerLinesInSomeOrder) { - final Set lines = new LinkedHashSet(); - lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); - lines.addAll(headerLinesInSomeOrder); - return Collections.unmodifiableSet(lines); - } + //TODO: NOTE: since this returns all of the metadata lines, including the fileformat version line, + // in sorted order, the fileformat line is almost certainly not the first line in the list. + /** + * Get the metadata associated with this header in sorted order. + * + * @return Metadata lines in sorted order (based on lexicographical sort of string encodings). 
+ */ + public Set getMetaDataInSortedOrder() { return mMetaData.getMetaDataInSortedOrder(); } + // TODO: Is it useful to retain this method ? It returns the first match for the given key. Should we + // deprecate it (and add a new one that returns a Collection) or just change it to return a Collection ? /** * Get the VCFHeaderLine whose key equals key. Returns null if no such line exists * @param key * @return */ - public VCFHeaderLine getMetaDataLine(final String key) { - for (final VCFHeaderLine line: mMetaData) { - if ( line.getKey().equals(key) ) - return line; - } - - return null; - } + public VCFHeaderLine getMetaDataLine(final String key) { return mMetaData.getMetaDataLine(key); } /** * get the genotyping sample names @@ -461,40 +457,32 @@ public int getColumnCount() { /** * Returns the INFO HeaderLines in their original ordering */ - public Collection getInfoHeaderLines() { - return mInfoMetaData.values(); - } + public Collection getInfoHeaderLines() { return mMetaData.getInfoHeaderLines(); } /** * Returns the FORMAT HeaderLines in their original ordering */ - public Collection getFormatHeaderLines() { - return mFormatMetaData.values(); - } + public Collection getFormatHeaderLines() { return mMetaData.getFormatHeaderLines(); } /** - * @param id the header key name + * @param id the id of the requested header line * @return the meta data line, or null if there is none */ public VCFInfoHeaderLine getInfoHeaderLine(final String id) { - return mInfoMetaData.get(id); + return mMetaData.getInfoHeaderLine(id); } /** - * @param id the header key name + * @param id the id of the requested header line * @return the meta data line, or null if there is none */ - public VCFFormatHeaderLine getFormatHeaderLine(final String id) { - return mFormatMetaData.get(id); - } + public VCFFormatHeaderLine getFormatHeaderLine(final String id) { return mMetaData.getFormatHeaderLine(id); } /** - * @param id the header key name + * @param id the id of the requested header line * 
@return the meta data line, or null if there is none */ - public VCFFilterHeaderLine getFilterHeaderLine(final String id) { - return mFilterMetaData.get(id); - } + public VCFFilterHeaderLine getFilterHeaderLine(final String id) { return mMetaData.getFilterHeaderLine(id); } public boolean hasInfoLine(final String id) { return getInfoHeaderLine(id) != null; @@ -508,20 +496,20 @@ public boolean hasFilterLine(final String id) { return getFilterHeaderLine(id) != null; } + // TODO: Is this useful ? It returns the first match for the given key, even though there + // can be multiple lines with the same key should we deprecate this method (and leave it and + // add the new one) or just change it to return a collection ? /** - * @param key the header key name + * @param key the of the requested other header line * @return the meta data line, or null if there is none */ - public VCFHeaderLine getOtherHeaderLine(final String key) { - return mOtherMetaData.get(key); - } + public VCFHeaderLine getOtherHeaderLine(final String key) { return mMetaData.getOtherHeaderLine(key); } /** - * Returns the other HeaderLines in their original ordering + * Returns the other HeaderLines in their original ordering, where "other" means any + * VCFHeaderLine that is not a contig, info, format or filter header line. */ - public Collection getOtherHeaderLines() { - return mOtherMetaData.values(); - } + public Collection getOtherHeaderLines() { return mMetaData.getOtherHeaderLines(); } /** * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. 
@@ -565,10 +553,123 @@ public HashMap getSampleNameToOffset() { @Override public String toString() { - final StringBuilder b = new StringBuilder(); - b.append("[VCFHeader:"); - for ( final VCFHeaderLine line : mMetaData ) - b.append("\n\t").append(line); - return b.append("\n]").toString(); + return mMetaData.toString(); + } + + /** + * Return a set of header lines resulting from merging the header lines from two or more headers. The + * headers must be version-compatible as defined by {@link VCFHeaderVersion#versionsAreCompatible}. + * @param headers + * @param emitWarnings + * @return + * @throws IllegalStateException + */ + public static Set getMergedHeaderLines(final Collection headers, final boolean emitWarnings) { + + final VCFMetaDataLines mergedMetaData = new VCFMetaDataLines(); + final HeaderConflictWarner conflictWarner = new HeaderConflictWarner(emitWarnings); + final Set vcfVersions = new HashSet<>(headers.size()); + + for ( final VCFHeader source : headers ) { + validateAllowedVersionMerger(vcfVersions, source.getHeaderVersion()); + for ( final VCFHeaderLine line : source.getMetaDataInSortedOrder()) { + + String key = line.getKey(); + if (VCFHeaderVersion.isFormatString(key)) { + continue; // drop file format strings + } + + final VCFHeaderLine other = mergedMetaData.hasEquivalentHeaderLine(line); + if (other != null && !line.equals(other) ) { + // TODO: NOTE: In order to be equal, structured header lines must have identical attributes + // and values, which is different from the previous implementation for some line types like + // compound header lines. 
+ if (!line.getKey().equals(other.getKey())) { + throw new IllegalArgumentException( + String.format("Attempt to merge incompatible header lines %s/%s", line.getKey(), other.getKey())); + } else if (key.equals(VCFConstants.FORMAT_HEADER_KEY)) { + // Delegate to the resolver function + mergedMetaData.addMetaDataLine(VCFCompoundHeaderLine.getSmartMergedCompoundHeaderLine( + (VCFCompoundHeaderLine) line, + (VCFCompoundHeaderLine) other, + conflictWarner, + (l1, l2) -> new VCFFormatHeaderLine( + l1.getID(), + VCFHeaderLineCount.UNBOUNDED, + l1.getType(), + l1.getDescription()) + ) + ); + } else if (key.equals(VCFConstants.INFO_HEADER_KEY)) { + // Delegate to the resolver function + mergedMetaData.addMetaDataLine(VCFCompoundHeaderLine.getSmartMergedCompoundHeaderLine( + (VCFCompoundHeaderLine) line, + (VCFCompoundHeaderLine) other, + conflictWarner, + (l1, l2) -> new VCFInfoHeaderLine( + l1.getID(), + VCFHeaderLineCount.UNBOUNDED, + l1.getType(), + l1.getDescription()) + ) + ); + } else { + // same type of header line; not equal; but not compound(format/info) + // preserve the existing one; this may drop attributes/values + conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + + line + " already present header = " + other); + } + } else { + mergedMetaData.addMetaDataLine(line); + } + } + } + // returning a LinkedHashSet so that ordering will be preserved. Ensures the contig lines do not get scrambled. + return new LinkedHashSet<>(mergedMetaData.getMetaDataInInputOrder()); + } + + /** + * Pairwise compare the new version we found with every other version we've seen so far and see if any + * are mutually incompatible. 
+ * + * @param sourceVersions + * @param targetVersion + */ + private static void validateAllowedVersionMerger(Set sourceVersions, VCFHeaderVersion targetVersion) { + Utils.nonNull(sourceVersions); + Utils.nonNull(targetVersion); + + Set incompatibleVersions = sourceVersions.stream() + .filter(v -> !VCFHeaderVersion.versionsAreCompatible(v, targetVersion)) + .collect(Collectors.toSet()); + if (!incompatibleVersions.isEmpty()) { + StringBuilder sb = new StringBuilder(); + sb.append(String.format( + "Attempt to merge a version %s header with incompatible vcf headers from versions:", + targetVersion.getVersionString())); + sb.append(incompatibleVersions.stream() + .map(v -> v.getVersionString()) + .collect(Collectors.joining(",")) + ); + throw new TribbleException(sb.toString()); + } } + + /** Only displays a warning if warnings are enabled and an identical warning hasn't been already issued */ + static final class HeaderConflictWarner { + boolean emitWarnings; + Set alreadyIssued = new HashSet(); + + protected HeaderConflictWarner( final boolean emitWarnings ) { + this.emitWarnings = emitWarnings; + } + + public void warn(final VCFHeaderLine line, final String msg) { + if ( emitWarnings && ! alreadyIssued.contains(line.getKey()) ) { + alreadyIssued.add(line.getKey()); + logger.warn(msg); + } + } + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java index ce12c42730..7a43218640 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java @@ -26,28 +26,20 @@ package htsjdk.variant.vcf; import htsjdk.tribble.TribbleException; +import htsjdk.utils.Utils; import java.io.Serializable; -import java.util.Map; - /** - * @author ebanks - *

- * Class VCFHeaderLine - *

- *

- * A class representing a key=value entry in the VCF header - *

+ *

Class VCFHeaderLine

+ *

A class representing a key=value entry in the VCF header, and the base class for all structured header lines

*/ public class VCFHeaderLine implements Comparable, Serializable { public static final long serialVersionUID = 1L; - protected static final boolean ALLOW_UNBOUND_DESCRIPTIONS = true; - protected static final String UNBOUND_DESCRIPTION = "Not provided in original VCF header"; - - private String mKey = null; - private String mValue = null; + // immutable - we don't want to let the hash value change + private final String mKey; + private final String mValue; /** * create a VCF header line @@ -56,14 +48,9 @@ public class VCFHeaderLine implements Comparable, Serializable { * @param value the value for this header line */ public VCFHeaderLine(String key, String value) { - if ( key == null ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot be null"); - if ( key.contains("<") || key.contains(">") ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot contain angle brackets"); - if ( key.contains("=") ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot contain an equals sign"); mKey = key; mValue = value; + validate(); } /** @@ -85,7 +72,68 @@ public String getValue() { } /** - * By default the header lines won't be added to the dictionary, unless this method will be override (for example in FORMAT, INFO or FILTER header lines) + * @return true if this is a structured header line (has a unique ID and key/value pairs), otherwise false + */ + public boolean isStructuredHeaderLine() { return false; } + + /** + * Return the unique ID for this line. Returns null iff isStructuredHeaderLine is false. + * @return + */ + public String getID() { return null; } + + /** + * Validate the state of this header line. Require the key be valid as an "id". + */ + private void validate() { + validateAsID(mKey, "key"); + } + + /** + * Called when an attempt is made to add this VCFHeaderLine to a VCFHeader, or when an attempt is made + * to change the version of a VCFHeader by changing it's target version. 
Validates that the header line + * conforms to the target version requirements. + * + * Subclasses should override this to provide type-specific version validation, and the overrides should + * also call super.validateForVersion to allow each class in the class hierarchy to do class-level validation. + */ + public void validateForVersion(final VCFHeaderVersion vcfTargetVersion) { + // If this header line is itself a fileformat/version line, + // make sure it doesn't clash with the new targetVersion. + if (VCFHeaderVersion.isFormatString(getKey())) { + if (!vcfTargetVersion.getFormatString().equals(getKey()) || + !vcfTargetVersion.getVersionString().equals(getValue())) { + throw new TribbleException( + String.format("The fileformat header line \"%s\" is incompatible with version \"%s\"", + this.toStringEncoding(), + vcfTargetVersion.toString())); + } + } + } + + /** + * Validate a string that is to be used as a unique id or key field. + */ + protected static void validateAsID(final String keyString, final String sourceName) { + Utils.nonNull(sourceName); + if (keyString == null) { + throw new TribbleException( + String.format("VCFHeaderLine: A valid %s cannot be null or empty", sourceName)); + } + if ( keyString.contains("<") || keyString.contains(">") ) { + throw new TribbleException( + String.format("VCFHeaderLine: %s cannot contain angle brackets", sourceName)); + } + if ( keyString.contains("=") ) { + throw new TribbleException( + String.format("VCFHeaderLine: %s cannot contain an equals sign", sourceName)); + } + } + + //TODO: this method and it's overrides should be removed since they're for BCF only + /** + * By default the header lines won't be added to the dictionary, unless this method will be override + * (for example in FORMAT, INFO or FILTER header lines) * * @return false */ @@ -140,36 +188,4 @@ public static boolean isHeaderLine(String line) { return line != null && !line.isEmpty() && VCFHeader.HEADER_INDICATOR.equals(line.substring(0,1)); } - /** - * 
create a string of a mapping pair for the target VCF version - * @param keyValues a mapping of the key->value pairs to output - * @return a string, correctly formatted - */ - public static String toStringEncoding(Map keyValues) { - StringBuilder builder = new StringBuilder(); - builder.append('<'); - boolean start = true; - for (Map.Entry entry : keyValues.entrySet()) { - if (start) start = false; - else builder.append(','); - - if ( entry.getValue() == null ) throw new TribbleException.InternalCodecException("Header problem: unbound value at " + entry + " from " + keyValues); - - builder.append(entry.getKey()); - builder.append('='); - builder.append(entry.getValue().toString().contains(",") || - entry.getValue().toString().contains(" ") || - entry.getKey().equals("Description") ? "\""+ escapeQuotes(entry.getValue().toString()) + "\"" : entry.getValue()); - } - builder.append('>'); - return builder.toString(); - } - - private static String escapeQuotes(final String value) { - // java escaping in a string literal makes this harder to read than it should be - // without string literal escaping and quoting the regex would be: replaceAll( ([^\])" , $1\" ) - // ie replace: something that's not a backslash ([^\]) followed by a double quote - // with: the thing that wasn't a backslash ($1), followed by a backslash, followed by a double quote - return value.replaceAll("([^\\\\])\"", "$1\\\\\""); - } } \ No newline at end of file diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java index 080153a990..c2921ce2d0 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java @@ -25,9 +25,78 @@ package htsjdk.variant.vcf; +import htsjdk.utils.Utils; + /** * the count encodings we use for fields in VCF header lines */ +//TODO: this should really be called VCHFHeaderCountType public enum VCFHeaderLineCount { INTEGER, A, R, G, 
UNBOUNDED; + + // A default int value used to represent an integral count value (not a count *type*) when the + // actual count is derived and not a fixed integer (i.e., when isFixedCount()==false) + public static final int VARIABLE_COUNT = -1; + + public boolean isFixedCount() { return this.equals(INTEGER); } + + /** + * Decode a header line count string and return the corresponding VCFHeaderLineCount enum value. + * If the value is not recognized as a valid constant, assume the string represents a fixed, numeric + * value, and return Integer. The caller should convert and validate the actual value. + * + * @param vcfVersion + * @param countTypeString + * @return + */ + protected static VCFHeaderLineCount decode(final VCFHeaderVersion vcfVersion, final String countTypeString) { + Utils.nonNull(vcfVersion); + Utils.nonNull(countTypeString); + + if (countTypeString.equals(VCFConstants.PER_ALTERNATE_COUNT)) { + return A; + } else if (countTypeString.equals(VCFConstants.PER_ALLELE_COUNT)) { + return R; + } else if (countTypeString.equals(VCFConstants.PER_GENOTYPE_COUNT)) { + return G; + } else if ( + (vcfVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && countTypeString.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || + (!vcfVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && countTypeString.equals(VCFConstants.UNBOUNDED_ENCODING_v3))) { + return VCFHeaderLineCount.UNBOUNDED; + } else { + return VCFHeaderLineCount.INTEGER; // assume integer + } + } + + /** + * Encode a count type as a string suitable for serialization to a VCF header. Note this is + * not version aware and defaults to VCFv4 format. + * + * @param actualCount Must be the special value {@code VARIABLE_COUNT} unless this object is {@code VCFHeaderLineCount.INTEGER}. + * @return String encoding of this enum, or the {@code actualCount} if the type of this count + * is VCFHeaderLineCount.INTEGER. 
+ * + * @throws IllegalArgumentException if {@code actualCount} is not the special value {@code VARIABLE_COUNT} and this + * is not the {@code VCFHeaderLineCount.INTEGER} enum object. + */ + public String encode(final int actualCount) { + if (this != INTEGER && actualCount != VARIABLE_COUNT) { + // Should only supply an actualCount if the count type == INTEGER + throw new IllegalArgumentException("Inconsistent header line number encoding request"); + } + switch (this) { + case A: + return VCFConstants.PER_ALTERNATE_COUNT; + case R: + return VCFConstants.PER_ALLELE_COUNT; + case G: + return VCFConstants.PER_GENOTYPE_COUNT; + case UNBOUNDED: + return VCFConstants.UNBOUNDED_ENCODING_v4; + case INTEGER: + return Integer.toString(actualCount); + } + throw new IllegalStateException("Unexpected VCFHeaderLineCount enum value"); + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java index 3ac72b28c6..f8e6a739e3 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java @@ -26,6 +26,7 @@ package htsjdk.variant.vcf; import htsjdk.tribble.TribbleException; +import htsjdk.utils.Utils; import java.util.HashMap; import java.util.LinkedHashMap; @@ -33,39 +34,57 @@ import java.util.Map; /** - * A class for translating between vcf header versions + * A class for translating between vcf header versions and corresponding header line parsers. 
*/ public class VCFHeaderLineTranslator { private static Map mapping; static { - mapping = new HashMap(); + mapping = new HashMap<>(); mapping.put(VCFHeaderVersion.VCF4_0,new VCF4Parser()); mapping.put(VCFHeaderVersion.VCF4_1,new VCF4Parser()); mapping.put(VCFHeaderVersion.VCF4_2,new VCF4Parser()); + mapping.put(VCFHeaderVersion.VCF4_3,new VCF4Parser()); mapping.put(VCFHeaderVersion.VCF3_3,new VCF3Parser()); mapping.put(VCFHeaderVersion.VCF3_2,new VCF3Parser()); } + /** + * Parse a VCFHeaderLine for the given version. + * + * @param version VCFHeaderVersion of the header line + * @param valueLine the header line string + * @param expectedTagOrder List of expected tags (interpreted differently by the VCF3 and VCF4 parsers). + * @return a mapping of the tags parsed out + */ public static Map parseLine(VCFHeaderVersion version, String valueLine, List expectedTagOrder) { - return mapping.get(version).parseLine(valueLine,expectedTagOrder); + return mapping.get(version).parseLine(valueLine, expectedTagOrder); } } - +/** + * Parse a VCFHeaderLine. + */ interface VCFLineParser { public Map parseLine(String valueLine, List expectedTagOrder); } - /** * a class that handles the to and from disk for VCF 4 lines */ class VCF4Parser implements VCFLineParser { + /** - * parse a VCF4 line - * @param valueLine the line - * @return a mapping of the tags parsed out + * Parse a VCFHeaderLine. The expectedTagOrder list prescribes the order in which tags should appear, but + * all tags are treated as optional. Additional tags are allowed after the expected tags, and may appear in + * any order. It is the caller's responsibility to validate that all required tags are present and that + * any additional "optional" tags are valid. + * + * @param valueLine the header line string + * @param expectedTagOrder List of tags that are required to appear in the order they're expected. Additional + * "extra" tags are allowed after the tags in this list, and must be validated by + * the caller. 
+ * @return a mapping of all tags parsed out */ @Override public Map parseLine(String valueLine, List expectedTagOrder) { @@ -129,17 +148,22 @@ public Map parseLine(String valueLine, List expectedTagO throw new TribbleException.InvalidHeader("Unclosed quote in header line value " + valueLine); } - // validate the tags against the expected list - index = 0; + // Validate the order of all discovered tags against expectedTagOrder. All tags are treated as + // "optional". Succeeding does not mean that all expected tags in the list were seen. Also, all + // structured header lines can have "extra" tags, with no order specified, so additional tags + // are tolerated. if ( expectedTagOrder != null ) { - if ( ret.size() > expectedTagOrder.size() ) - throw new TribbleException.InvalidHeader("unexpected tag count " + ret.size() + " in line " + valueLine); - for ( String str : ret.keySet() ) { - if ( !expectedTagOrder.get(index).equals(str) ) - throw new TribbleException.InvalidHeader("Unexpected tag " + str + " in line " + valueLine); + index = 0; + for (String str : ret.keySet()) { + if (index >= expectedTagOrder.size()) { + break; // done - end of expectedTagOrder list + } else if (!expectedTagOrder.get(index).equals(str)) { + throw new TribbleException.InvalidHeader("Unexpected tag or tag order: " + str + " in line " + valueLine); + } index++; } } + return ret; } } @@ -149,7 +173,7 @@ class VCF3Parser implements VCFLineParser { @Override public Map parseLine(String valueLine, List expectedTagOrder) { // our return map - Map ret = new LinkedHashMap(); + Map ret = new LinkedHashMap<>(); // a builder to store up characters as we go StringBuilder builder = new StringBuilder(); @@ -166,18 +190,32 @@ public Map parseLine(String valueLine, List expectedTagO for (char c: valueLine.toCharArray()) { switch (c) { case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it - case (',') : if (!inQuote) { 
ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); builder = new StringBuilder(); break; } // drop the current key value to the return map + case (',') : + if (!inQuote) { + ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); + builder = new StringBuilder(); + break; + } // drop the current key value to the return map default: builder.append(c); // otherwise simply append to the current string } index++; } ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); - // validate the tags against the expected list + // Validate that: + // we have no more tags than are expected + // the ones we have are in the expected list + // they appear in the same order as in the expected list + // This does no checking for missing tags; all tags are treated as optional + // index = 0; - if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size()); + if (tagIndex != expectedTagOrder.size()) { + throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size()); + } for (String str : ret.keySet()){ - if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); + if (!expectedTagOrder.get(index).equals(str)) { + throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); + } index++; } return ret; diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java index 785449de89..6c37f1b48c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java @@ -25,9 +25,37 @@ package htsjdk.variant.vcf; +import htsjdk.utils.Utils; + /** * the type encodings we use for fields in VCF header lines */ public enum VCFHeaderLineType { - Integer, Float, String, Character, Flag; + Integer, + Float, + 
String, + Character, + Flag; + + /** + * Decode a header line type string and return the corresponding VCFHeaderLineType enum value. + * The string must exactly match the name of one of the enum constants; any other value causes + * {@code valueOf} to throw an IllegalArgumentException. The caller should handle or avoid that case. + * + * @param lineTypeString the type string to decode; must not be null + * @return VCFHeaderLineType for {@code lineTypeString} + */ + protected static VCFHeaderLineType decode(final String lineTypeString) { + Utils.nonNull(lineTypeString); + return VCFHeaderLineType.valueOf(lineTypeString); + } + + /** + * Encode this line type as a string suitable for serialization to a VCF header. Note this is + * not version specific and defaults to VCFv42. + * + * The serialized encoding is the simple name of the enum constant + * @return string encoding of this line type + */ + String encode() { return this.toString(); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java index b45d4230df..25c7eb79fd 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java @@ -26,16 +26,18 @@ package htsjdk.variant.vcf; import htsjdk.tribble.TribbleException; +import htsjdk.utils.Utils; /** * information that identifies each header version */ public enum VCFHeaderVersion { - VCF3_2("VCRv3.2","format"), + VCF3_2("VCRv3.2","format"), // the string "format" is the lower case version of the key used for FORMAT lines... 
VCF3_3("VCFv3.3","fileformat"), VCF4_0("VCFv4.0","fileformat"), VCF4_1("VCFv4.1","fileformat"), - VCF4_2("VCFv4.2","fileformat"); + VCF4_2("VCFv4.2","fileformat"), + VCF4_3("VCFv4.3","fileformat"); private final String versionString; private final String formatString; @@ -45,7 +47,7 @@ public enum VCFHeaderVersion { * @param vString the version string * @param fString the format string */ - VCFHeaderVersion(String vString, String fString) { + VCFHeaderVersion(String vString, String fString) { this.versionString = vString; this.formatString = fString; } @@ -96,6 +98,13 @@ public static VCFHeaderVersion getHeaderVersion(String versionLine) { return toHeaderVersion(lineFields[1]); } + /** + * @return A VCF fileformat=version metadata string for this version. + */ + public String getVersionLine() { + return String.format("%s=%s", getFormatString(), getVersionString()); + } + /** * Utility function to clean up a VCF header string * @@ -114,10 +123,12 @@ private static String clean(String s) { */ public boolean isAtLeastAsRecentAs(final VCFHeaderVersion target) { switch (target) { + case VCF4_3: + return this == VCF4_3; case VCF4_2: - return this == VCF4_2; + return this == VCF4_2 || this == VCF4_3; case VCF4_1: - return this == VCF4_1 || this == VCF4_2; + return this == VCF4_1 || this == VCF4_2 || this == VCF4_3; case VCF4_0: return this != VCF3_2 && this != VCF3_3; case VCF3_3: @@ -128,6 +139,18 @@ public boolean isAtLeastAsRecentAs(final VCFHeaderVersion target) { } } + /** + * Determine if two header versions are compatible. For now, the only incompatibility is between V4.3 + * and any other version. All other versions are compatible. 
+ * @param v1 the first version to compare + * @param v2 the second version to compare + * @return true if {@code v1} equals {@code v2}, or if neither version is at least as recent as VCF4_3; false otherwise + */ + public static boolean versionsAreCompatible(final VCFHeaderVersion v1, final VCFHeaderVersion v2) { + return v1.equals(v2) || + !(v1.isAtLeastAsRecentAs(VCF4_3) || v2.isAtLeastAsRecentAs(VCF4_3)); + } + public String getVersionString() { return versionString; } @@ -135,4 +158,5 @@ public String getVersionString() { public String getFormatString() { return formatString; } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFIDHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFIDHeaderLine.java deleted file mode 100644 index 246f89eea9..0000000000 --- a/src/main/java/htsjdk/variant/vcf/VCFIDHeaderLine.java +++ /dev/null @@ -1,31 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package htsjdk.variant.vcf; - -/** an interface for ID-based header lines **/ -public interface VCFIDHeaderLine { - String getID(); -} diff --git a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java index afa1f8141a..f17351400b 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java @@ -26,32 +26,49 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; + /** - * @author ebanks *

* Class VCFInfoHeaderLine *

*

- * A class representing a key=value entry for INFO fields in the VCF header + * A class representing an INFO field in the VCF header *

*/ public class VCFInfoHeaderLine extends VCFCompoundHeaderLine { - public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); - } + private static final long serialVersionUID = 1L; + + protected final static Log logger = Log.getInstance(VCFInfoHeaderLine.class); public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); + } + + public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) { + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); } public VCFInfoHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.INFO); + super(VCFConstants.INFO_HEADER_KEY, + VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder), + version + ); + validateForVersion(version); } - // info fields allow flag values + /** + * Return true if the attribute name requires quotes. 4.3 spec requires info attributes DESCRIPTION, SOURCE + * and VERSION to be quoted. 
+ * @param attributeName name of the attribute being serialized + * @return boolean indicating whether the value should be embedded in quotes during serialization + */ @Override - boolean allowFlagValues() { - return true; + protected boolean getIsQuotableAttribute(final String attributeName) { + return attributeName.equals(DESCRIPTION_ATTRIBUTE) || + attributeName.equals(SOURCE_ATTRIBUTE) || + attributeName.equals(VERSION_ATTRIBUTE); } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java new file mode 100644 index 0000000000..884fe6c147 --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java @@ -0,0 +1,298 @@ +package htsjdk.variant.vcf; + +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + +//TODO: should this class be public so consumers can use in place of Set + +/** + * Class for managing a set of VCFHeaderLines for a VCFHeader. + */ +class VCFMetaDataLines implements Serializable { + public static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFMetaDataLines.class); + + // Require a unique key for each ID line + // Allow duplicate non-ID keys, unless fileformat, reference ? + + // TODO: Should we reject attempts to add two contig header lines with the same contigIndex ? + // TODO: GATK VcfUtilsUnitTest.createHeaderLines test creates headers with contig lines with identical (0) indices + private final Map mMetaData = new LinkedHashMap<>(); + + private static final String KEY_SEPARATOR = ":"; + + // Namespace key used for "other" (unstructured, non-ID) metadata lines. This string needs to be different from + // any string in the set of legal structured line types in knownStructuredLineKeys. + private static final String OTHER_KEY = "OTHER"; + + /** + * Add all metadata lines from Set. 
If a duplicate line is encountered (same key/ID pair for + * structured lines, or duplicate content for unstructured lines with identical keys) + * only the first line will be retained. + * + * @param newMetaData Set of lines to be added to the list. + * @throws IllegalArgumentException if a fileformat line is added + */ + public void addAllMetaDataLines(Set newMetaData) { + newMetaData.stream().forEach(hl -> addMetaDataLine(hl)); + } + + /** + * Add a metadata line to the list. If a duplicate line is encountered (same key/ID pair for + * structured lines, or duplicate content for unstructured lines with identical keys) + * only the first line will be retained. + * + * @param headerLine header line to attempt to add + * @throws IllegalArgumentException if a fileformat line is added + */ + public void addMetaDataLine(final VCFHeaderLine headerLine) { + String key = makeKeyForLine(headerLine); + if ( mMetaData.get(key) != null ) { + final String message = String.format( + "Attempt to add header line (%s) collides with existing line header line (%s). " + + "The existing line will be retained", + mMetaData.get(key), + headerLine); + //TODO: should this also detect/reject multiple "assembly" or "reference" lines ? + if (VCFHeaderVersion.isFormatString(headerLine.getKey())) { // Throw if there is more than one fileformat line + throw new TribbleException(message); + } else if (VCFUtils.getVerboseVCFLogging()) { + // TODO: The previous header implementation would round trip lines with duplicate IDs by preserving + // TODO: them in the master header line list maintained by VCFHeader, but would silently drop them + // TODO: from the typed header line lists, so the duplicates would not surface in queries (i.e. via + // TODO: getInfoHeaderLines()). + // + // TODO: This implementation doesn't allow the duplicates to be preserved (the 4.2/4.3 specs expressly + // TODO: forbid it), so they're dropped here. 
There are several Picard and GATK test files around with + // TODO: duplicate ID lines, which will have to be modified in order for the tests to pass. + // + // TODO: Is it sufficient to log a warning for this case (duplicate key) ? Should we add code + // TODO: here to throw only for v4.3+ ? Only if VCFUtils.getStrictVCFVersionValidation is set ? Or both ? + logger.warn(message); + } + } else { + mMetaData.put(key, headerLine); + } + } + + /** + * Generate a unique key for a VCFHeaderLine. If the header line is a VCFStructuredHeaderLine, the key is the + * concatenation of the VCFHeaderLine's key (i.e., the type of the VCFHeaderLine) and the ID for that + * VCFHeaderLine (with a ":" separator). Otherwise, we use the concatenation of the OTHER_KEY, the VCFHeaderLine's + * key, and a nonce value to ensure that unstructured lines never collide with structured lines, and also can + * have duplicate identical instances. + * + * @param headerLine + * @return + */ + private String makeKeyForLine(final VCFHeaderLine headerLine) { + if (headerLine.isStructuredHeaderLine()) { // required to have a unique ID + // use the line type as the namespace, to ensure unique key/id combination + return makeKey(headerLine.getKey(), headerLine.getID()); + } else { + // Allow duplicate unstructured "other" keys, as long as they have different values. Prepend + // the string "OTHER" to prevent a non-structured line from having a key that overlaps a key + // key for a "known" structured line type, such as: + // + // ##FORMAT:bar=... + // + // which would overlap with the key generated for a real FORMAT line with id=bar. + // + // TODO: The previous implementation dropped duplicate keys for unstructured lines, but I don't think + // the spec requires these to be unique. This is more permissive in that it allows duplicate lines such + // as ##GATKCommandLines to accumulate if they have different values, but retains only one if it has + // a unique value. 
+ return makeKey(OTHER_KEY, headerLine.getKey() + Integer.toString(headerLine.hashCode())); + } + } + + // Create a VCFHeaderLine hashmap key given a key and an id + private String makeKey(final String nameSpace, final String id) { return nameSpace + KEY_SEPARATOR + id; } + + /** + * Return the existing line if the list already contains a header line of the same type/id. + * @param line + * @return The existing header line of this type/key, otherwise NULL. + */ + public VCFHeaderLine hasEquivalentHeaderLine(final VCFHeaderLine line) { + return mMetaData.get(makeKeyForLine(line)); + } + + /** + * Remove the headerline matching this headerline (determined by makeKeyForLine) if it exists + * @param headerLine + * @return The headerline removed, or null if no headerline with a matching key was found + */ + public VCFHeaderLine removeHeaderLine(final VCFHeaderLine headerLine) { + return mMetaData.remove(makeKeyForLine(headerLine)); + } + + /** + * Starting at version 4.3, validate all metadata lines against a target version. + * @param targetVersion + */ + public void validateMetaDataLines(final VCFHeaderVersion targetVersion) { + mMetaData.values().forEach(line -> validateMetaDataLine(targetVersion, line)); + } + + /** + * Starting at version 4.3, validate all metadata lines against the target version, + * including the case where the headerLine is itself a fileformat line with a version, in case + * it clashes. 
+ * + * @param targetVersion + * @param vcfHeaderLine + */ + public void validateMetaDataLine(final VCFHeaderVersion targetVersion, final VCFHeaderLine vcfHeaderLine) { + vcfHeaderLine.validateForVersion(targetVersion); + } + + /** + * get the meta data, associated with this header, in input (insertion) order + * + * @return a set of the meta data + */ + public Set getMetaDataInInputOrder() { + return Collections.unmodifiableSet(new LinkedHashSet(mMetaData.values())); + } + + public Set getMetaDataInSortedOrder() { + return Collections.unmodifiableSet(new TreeSet<>(mMetaData.values())); + } + + /** + * @return all of the structured (ID) lines in their original file order, or an empty list if none were present + */ + public List getStructuredHeaderLines() { + return mMetaData.values().stream() + .filter(hl -> hl.isStructuredHeaderLine()) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present + */ + public List getFilterLines() { + return mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(VCFConstants.FILTER_HEADER_KEY)) + .map(hl -> (VCFFilterHeaderLine) hl) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present + */ + public List getContigLines() { + List contigLines = new ArrayList<>(); + mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(VCFConstants.CONTIG_HEADER_KEY)) + .forEach(hl -> contigLines.add((VCFContigHeaderLine) hl)); + return Collections.unmodifiableList(contigLines); + } + + //TODO: Is this useful ? It returns the first match for the given key, no matter how many + //TODO: there are. Should we deprecate it (and add a new one that returns a collection) + //TODO: or just change this one to return a collection ? + /** + * Get the VCFHeaderLine whose key equals key. 
Returns null if no such line exists + * @param key + * @return + */ + public VCFHeaderLine getMetaDataLine(final String key) { + return mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(key)) + .findFirst() + .orElseGet(()->null); + } + + /** + * Returns the INFO HeaderLines in their original ordering + */ + public Collection getInfoHeaderLines() { + return mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(VCFConstants.INFO_HEADER_KEY)) + .map(hl -> (VCFInfoHeaderLine) hl) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * Returns the FORMAT HeaderLines in their original ordering + */ + public Collection getFormatHeaderLines() { + return mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(VCFConstants.FORMAT_HEADER_KEY)) + .map(hl -> (VCFFormatHeaderLine) hl) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * @param id the id of the requested header line + * @return the meta data line, or null if there is none + */ + public VCFInfoHeaderLine getInfoHeaderLine(final String id) { + return (VCFInfoHeaderLine) mMetaData.get(makeKey(VCFConstants.INFO_HEADER_KEY, id)); + } + + /** + * @param id the id of the requested header line + * @return the meta data line, or null if there is none + */ + public VCFFormatHeaderLine getFormatHeaderLine(final String id) { + return (VCFFormatHeaderLine) mMetaData.get(makeKey(VCFConstants.FORMAT_HEADER_KEY, id)); + } + + /** + * @param id the id of the requested header line + * @return the meta data line, or null if there is none + */ + public VCFFilterHeaderLine getFilterHeaderLine(final String id) { + return (VCFFilterHeaderLine) mMetaData.get(makeKey(VCFConstants.FILTER_HEADER_KEY, id)); + } + + //TODO: Is this useful ? It returns the first match for the given key, no matter how many + //TODO: there are. Should we deprecate it (and add a new one that returns a collection) + //TODO: or just change this one to return a collection ? 
+ /** + * @param key the of the requested other header line + * @return the meta data line, or null if there is none + */ + public VCFHeaderLine getOtherHeaderLine(final String key) { + Iterator it = getOtherHeaderLines().iterator(); + while (it.hasNext()) { + VCFHeaderLine next = it.next(); + if (next.getKey().equals(key)) { + return next; + } + } + return null; + } + + /** + * Returns the other HeaderLines in their original ordering, where "other" means any + * VCFHeaderLine that is not a contig, info, format or filter header line. + */ + public Collection getOtherHeaderLines() { + return mMetaData.values().stream().filter( + hl -> + !hl.getKey().equals(VCFConstants.CONTIG_HEADER_KEY) && + !hl.getKey().equals(VCFConstants.INFO_HEADER_KEY) && + !hl.getKey().equals(VCFConstants.FILTER_HEADER_KEY) && + !hl.getKey().equals(VCFConstants.FORMAT_HEADER_KEY) + ) + .collect(Collectors.toCollection(ArrayList::new)); + } + + @Override + public String toString() { + final StringBuilder b = new StringBuilder(); + b.append("[VCFHeader:"); + for ( final VCFHeaderLine line : mMetaData.values() ) + b.append("\n\t").append(line); + return b.append("\n]").toString(); + } +} + diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java new file mode 100644 index 0000000000..cee303bb4e --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java @@ -0,0 +1,40 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; + +import java.util.Collections; +import java.util.Map; + +/** + * A class representing META fields in the VCF header + */ +public class VCFMetaHeaderLine extends VCFStructuredHeaderLine { + private static final long serialVersionUID = 1L; + + public VCFMetaHeaderLine(final String line, final VCFHeaderVersion version) { + // We need to call the V4 parser directly since the V3 parser requires expected tags; validateForVersion + // will detect the version incompatibility if 
we're called on behalf of V3 + super(VCFConstants.META_HEADER_KEY, new VCF4Parser().parseLine(line, null)); + validateForVersion(version); + } + + public VCFMetaHeaderLine(final Map mapping) { + super(VCFConstants.META_HEADER_KEY, mapping); + } + + /** + * Validate that this header line conforms to the target version. + */ + @Override + public void validateForVersion(final VCFHeaderVersion vcfTargetVersion) { + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + throw new TribbleException.InvalidHeader( + String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion.toString()) + ); + } + super.validateForVersion(vcfTargetVersion); + } + +} diff --git a/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java new file mode 100644 index 0000000000..5120325664 --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java @@ -0,0 +1,53 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; + +import java.util.Collections; +import java.util.Map; + +/** + * A class representing PEDIGREE fields in the VCF header + * + * ##PEDIGREE= + * ##PEDIGREE= + * ##PEDIGREE= + * ##PEDIGREE= + * or a link to a database: ##pedigreeDB=URL + */ +public class VCFPedigreeHeaderLine extends VCFStructuredHeaderLine { + + private static final long serialVersionUID = 1L; + + public VCFPedigreeHeaderLine(String line, VCFHeaderVersion version) { + + // TODO: There are quite a few variants for expected tags (See above comment). Should we try to validate these? + // TODO: IF not, we don't really need to model PEDIGREE as a separate class ? 
+ // We need to call the V4 parser directly since the V3 parser requires expected tags; validateForVersion + // will detect the version incompatibility if we're called on behalf of V3 + super(VCFConstants.PEDIGREE_HEADER_KEY, new VCF4Parser().parseLine(line, null)); + validateForVersion(version); + } + + public VCFPedigreeHeaderLine(final Map mapping) { + super(VCFConstants.PEDIGREE_HEADER_KEY, mapping); + } + + /** + * Validate that this header line conforms to the target version. + */ + @Override + public void validateForVersion(final VCFHeaderVersion vcfTargetVersion) { + super.validateForVersion(vcfTargetVersion); + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + final String message = String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion.toString()); + if (VCFUtils.getStrictVCFVersionValidation()) { + throw new TribbleException.InvalidHeader(message); + } else { + logger.warn(message); + } + } + } + +} diff --git a/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java b/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java index 8fe9b67d6d..354496c4da 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java +++ b/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java @@ -28,7 +28,7 @@ public VCFRecordCodec(final VCFHeader header) { public VCFRecordCodec(final VCFHeader header, final boolean allowMissingFieldsInHeader) { this.vcfEncoder = new VCFEncoder(header, allowMissingFieldsInHeader, false); // Explicitly set the version because it's not available in the header itself. 
- this.vcfDecoder.setVCFHeader(header, VCFHeaderVersion.VCF4_2); + this.vcfDecoder.setVCFHeader(header, VCFHeader.DEFAULT_VCF_VERSION); } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java new file mode 100644 index 0000000000..b6689d4e61 --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java @@ -0,0 +1,46 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; +import htsjdk.utils.Utils; + +import java.util.Collections; +import java.util.Map; + +/** + * TODO: these are not well defined in the VCF 4.3 spec; they appear to require IDs, + * TODO: and have lots of possible attributes + */ +public class VCFSampleHeaderLine extends VCFStructuredHeaderLine { + + private static final long serialVersionUID = 1L; + + public VCFSampleHeaderLine(String line, VCFHeaderVersion version) { + // We need to call the V4 parser directly since the V3 parser requires expected tags; validateForVersion + // will detect the version incompatibility if we're called on behalf of V3 + super(VCFConstants.SAMPLE_HEADER_KEY, new VCF4Parser().parseLine(line, null)); + validateForVersion(version); + } + + public VCFSampleHeaderLine(final Map mapping) { + super(VCFConstants.SAMPLE_HEADER_KEY, mapping); + } + + /** + * Validate that this header line conforms to the target version. 
+ */ + @Override + public void validateForVersion(final VCFHeaderVersion vcfTargetVersion) { + super.validateForVersion(vcfTargetVersion); + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0)) { + final String message = String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion.toString()); + if (VCFUtils.getStrictVCFVersionValidation()) { + throw new TribbleException.InvalidHeader(message); + } else { + logger.warn(message); + } + } + } + +} diff --git a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java deleted file mode 100644 index 1c36f9e956..0000000000 --- a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java +++ /dev/null @@ -1,129 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package htsjdk.variant.vcf; - -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; - - -/** - * @author ebanks - * - * A class representing a key=value entry for simple VCF header types - */ -public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine { - - private String name; - private Map genericFields = new LinkedHashMap(); - - /** - * create a VCF filter header line - * - * @param key the key for this header line - * @param name the name for this header line - * @param description description for this header line - */ - public VCFSimpleHeaderLine(String key, String name, String description) { - super(key, ""); - Map map = new LinkedHashMap(1); - map.put("Description", description); - initialize(name, map); - } - - /** - * create a VCF info header line - * - * @param line the header line - * @param version the vcf header version - * @param key the key for this header line - * @param expectedTagOrdering the tag ordering expected for this header line - */ - public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List expectedTagOrdering) { - this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering)); - } - - public VCFSimpleHeaderLine(final String key, final Map mapping) { - super(key, ""); - name = mapping.get("ID"); - initialize(name, mapping); - } - - /** - * Returns the String value associated with the given key. Returns null if there is no value. Key - * must not be null. 
- */ - String getGenericFieldValue(final String key) { - return this.genericFields.get(key); - } - - protected void initialize(String name, Map genericFields) { - if ( name == null || genericFields == null || genericFields.isEmpty() ) - throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s", super.getKey(), name)); - if ( name.contains("<") || name.contains(">") ) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain angle brackets"); - if ( name.contains("=") ) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain an equals sign"); - - this.name = name; - this.genericFields.putAll(genericFields); - } - - @Override - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put("ID", name); - map.putAll(genericFields); - return getKey() + "=" + VCFHeaderLine.toStringEncoding(map); - } - - @Override - public boolean equals( final Object o ) { - if ( this == o ) { - return true; - } - if ( o == null || getClass() != o.getClass() || ! super.equals(o) ) { - return false; - } - - final VCFSimpleHeaderLine that = (VCFSimpleHeaderLine) o; - return name.equals(that.name) && - genericFields.equals(that.genericFields); - } - - @Override - public int hashCode() { - int result = super.hashCode(); - result = 31 * result + name.hashCode(); - result = 31 * result + genericFields.hashCode(); - return result; - } - - @Override - public String getID() { - return name; - } -} \ No newline at end of file diff --git a/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java b/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java index de2817c96b..038a977b9e 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java @@ -52,26 +52,30 @@ public class VCFStandardHeaderLines { * Enabling this causes us to repair header lines even if only their descriptions differ. 
*/ private final static boolean REPAIR_BAD_DESCRIPTIONS = false; - private static Standards formatStandards = new Standards(); - private static Standards infoStandards = new Standards(); + private static Standards formatStandards = new Standards<>(); + private static Standards infoStandards = new Standards<>(); /** * Walks over the VCF header and repairs the standard VCF header lines in it, returning a freshly * allocated {@link VCFHeader} with standard VCF header lines repaired as necessary. */ public static VCFHeader repairStandardHeaderLines(final VCFHeader header) { - final Set newLines = new LinkedHashSet(header.getMetaDataInInputOrder().size()); + final Set newLines = new LinkedHashSet<>(header.getMetaDataInInputOrder().size()); for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) { if ( line instanceof VCFFormatHeaderLine ) { line = formatStandards.repair((VCFFormatHeaderLine) line); } else if ( line instanceof VCFInfoHeaderLine) { line = infoStandards.repair((VCFInfoHeaderLine) line); } - newLines.add(line); } - return new VCFHeader(newLines, header.getGenotypeSamples()); + // TODO: this does not preserve any header state that is not captured by headerLines/sample-names/version + //NOTE that its possible for this to fail in the (probably rare) case that the repaired + //lines (which are "versionless") cannot pass validation against the header version + VCFHeader newHeader = new VCFHeader(header.getHeaderVersion(), newLines, header.getGenotypeSamples()); + + return newHeader; } /** @@ -152,25 +156,25 @@ private static void registerStandard(final VCFFormatHeaderLine line) { // static { // FORMAT lines - registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate 
read depth (reads with MQ=255 or with bad mates are filtered)")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); // INFO lines - registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + 
registerStandard(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); } private static class Standards { @@ -186,6 +190,7 @@ public T repair(final T line) { final boolean needsRepair = badCountType || 
badCount || badType || (REPAIR_BAD_DESCRIPTIONS && badDesc); if ( needsRepair ) { + // TODO: Should we warn/log when we do this ? if ( GeneralUtils.DEBUG_MODE_ENABLED ) { System.err.println("Repairing standard header line for field " + line.getID() + " because" + (badCountType ? " -- count types disagree; header has " + line.getCountType() + " but standard is " + standard.getCountType() : "") diff --git a/src/main/java/htsjdk/variant/vcf/VCFStructuredHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFStructuredHeaderLine.java new file mode 100644 index 0000000000..0a1dce78f8 --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFStructuredHeaderLine.java @@ -0,0 +1,219 @@ +/* +* Copyright (c) 2017 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package htsjdk.variant.vcf; + +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.Utils; + +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static java.util.Collections.unmodifiableMap; + +/** + * An abstract class representing a VCF metadata line with a key and attribute=value pairs, one of + * which represents an ID. The key determines the "type" of the structured header line (i.e., contig, FILTER, + * INFO, ALT, PEDIGREE, META). + * + * The attribute/value pairs are ordered. The first entry in the map must be an ID attribute (used by the + * VCFHeader to ensure that no two structured header lines that share the same key in a given header have the + * same ID). + */ +public class VCFStructuredHeaderLine extends VCFHeaderLine { + private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFStructuredHeaderLine.class); + + public static String ID_ATTRIBUTE = "ID"; + public static String DESCRIPTION_ATTRIBUTE = "Description"; + public static String SOURCE_ATTRIBUTE = "Source"; + public static String VERSION_ATTRIBUTE = "Version"; + + // Map used to retain the attribute/value pairs, in original order. The first entry in the map must be + // an ID field. The entire map must be immutable to prevent hash values from changing, since these are + // often stored in Sets. Its not final to allow for special cases where subclasses have to be able to + // "repair" header lines (via a call to updateGenericField) during constructor validation. + // + // Otherwise the values here should never change during the lifetime of the header line. + private Map genericFields; + + public VCFStructuredHeaderLine(final String key, final String line, final VCFHeaderVersion version) { + // We don't use any expectedTagOrder, since the only required tag is ID. 
+ this(key, VCFHeaderLineTranslator.parseLine(version, line, null)); + validate(); + validateForVersion(version); + } + + /** + * Key cannot be null or empty. + * + * @param key key to use for this header line. can not be null. + * @param id id name to use for this line + * @param description string that will be added as a "Description" tag to this line + */ + public VCFStructuredHeaderLine(final String key, final String id, final String description) { + super(key, ""); + genericFields = Collections.unmodifiableMap(new LinkedHashMap(){{ + put(ID_ATTRIBUTE, id); + put(DESCRIPTION_ATTRIBUTE, description); + }}); + validate(); + } + + /** + * Key cannot be null or empty. + * + * @param key key to use for this header line. can not be null. + * @param attributeMapping field mappings to use. may not be null. must contain an "ID" field to use as + * a unique id for this line + */ + public VCFStructuredHeaderLine(final String key, final Map attributeMapping) { + super(key, ""); + Utils.nonNull(attributeMapping, "An attribute map is required for structured header lines"); + genericFields = Collections.unmodifiableMap(new LinkedHashMap(attributeMapping)); + validate(); + } + + // Called by VCFInfoHeaderLine to allow repairing of VCFInfoLines that have a Flag type and a non-zero count + // (the combination of which is forbidden by the spec, but which we tolerate for backward compatibility with + // previous versions of htsjdk, which silently repaired these). + // + // Replaces the original generic fields map with another immutable map with the updated value. 
+ protected void updateGenericField(final String attributeName, final String value) { + // a little inefficient, but this happens pretty rarely + final Map tempMap = new LinkedHashMap(genericFields); + tempMap.put(attributeName, value); + genericFields = Collections.unmodifiableMap(new LinkedHashMap(tempMap)); + } + + private void validate() { + if ( genericFields.isEmpty() || !genericFields.keySet().stream().findFirst().get().equals(ID_ATTRIBUTE)) { + throw new TribbleException( + String.format("The required ID tag is missing or not the first attribute: key=%s", super.getKey())); + } + validateAsID(getGenericFieldValue(ID_ATTRIBUTE), "ID"); + } + + /** + * @return true if this is a structured header line (has a unique ID and multiple key/value pairs), + * otherwise false + */ + @Override + public boolean isStructuredHeaderLine() { return true; } + + /** + * Return the unique ID for this line. Returns null iff isStructuredHeaderLine is false. + * @return + */ + @Override + public String getID() { + return getGenericFieldValue(ID_ATTRIBUTE); + } + + /** + * Returns the String value associated with the given key. Returns null if there is no value. Key + * must not be null. + */ + public String getGenericFieldValue(final String key) { + return this.genericFields.get(key); + } + + /** + * Returns a list of the names of all attributes for this line. + */ + Set getGenericFieldNames() { + return this.genericFields.keySet(); + } + + /** + * create a string of a mapping pair for the target VCF version + * @return a string, correctly formatted + */ + @Override + protected String toStringEncoding() { + //NOTE: this preserves/round-trips "extra" attributes such as SOURCE, VERSION, etc. 
+ StringBuilder builder = new StringBuilder(); + builder.append(getKey()); + builder.append("=<"); + builder.append(genericFields.entrySet().stream() + .map(e -> e.getKey() + "=" + quoteAttributeValueForSerialization(e.getKey(), e.getValue())) + .collect(Collectors.joining(","))); + builder.append('>'); + return builder.toString(); + } + + // Add quotes around any attribute value that contains a space or comma, or is supposed to be quoted by + // definition per the spec (i.e., Description, Source, Version for INFO lines). + private String quoteAttributeValueForSerialization(final String attribute, final String originalValue) { + return originalValue.contains(",") || originalValue.contains(" ") || getIsQuotableAttribute(attribute) ? + "\""+ escapeQuotes(originalValue) + "\"" : + originalValue; + } + + /** + * Return true if the attribute name requires quotes. + * + * @param attributeName name of the attribute being serialized + * @return boolean indicating whether the value should be embedded in quotes during serialization + */ + protected boolean getIsQuotableAttribute(final String attributeName) { + // AFAICT, the spec only mentions that the DESCRIPTION attribute for info lines should be quoted, + // but the previous incarnations of htsjdk seem to do it for all DESCRIPTION attributes, so + // retain this for BWC + return attributeName.equals(DESCRIPTION_ATTRIBUTE); + } + + private static String escapeQuotes(final String value) { + // java escaping in a string literal makes this harder to read than it should be + // without string literal escaping and quoting the regex would be: replaceAll( ([^\])" , $1\" ) + // ie replace: something that's not a backslash ([^\]) followed by a double quote + // with: the thing that wasn't a backslash ($1), followed by a backslash, followed by a double quote + return value.replaceAll("([^\\\\])\"", "$1\\\\\""); + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + if ( o == null || 
getClass() != o.getClass() || ! super.equals(o) ) { + return false; + } + + final VCFStructuredHeaderLine that = (VCFStructuredHeaderLine) o; + return genericFields.equals(that.genericFields); + } + + @Override + public int hashCode() { + int result = super.hashCode(); + result = 31 * result + genericFields.hashCode(); + return result; + } +} \ No newline at end of file diff --git a/src/main/java/htsjdk/variant/vcf/VCFUtils.java b/src/main/java/htsjdk/variant/vcf/VCFUtils.java index c8eceeab57..10e576b160 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFUtils.java +++ b/src/main/java/htsjdk/variant/vcf/VCFUtils.java @@ -27,87 +27,29 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; -import htsjdk.variant.utils.GeneralUtils; +import htsjdk.tribble.TribbleException; import java.io.File; import java.util.ArrayList; import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; -import java.util.Map; import java.util.Set; -import java.util.TreeMap; public class VCFUtils { - public static Set smartMergeHeaders(final Collection headers, final boolean emitWarnings) throws IllegalStateException { - // We need to maintain the order of the VCFHeaderLines, otherwise they will be scrambled in the returned Set. - // This will cause problems for VCFHeader.getSequenceDictionary and anything else that implicitly relies on the line ordering. 
- final TreeMap map = new TreeMap(); // from KEY.NAME -> line - final HeaderConflictWarner conflictWarner = new HeaderConflictWarner(emitWarnings); - - // todo -- needs to remove all version headers from sources and add its own VCF version line - for ( final VCFHeader source : headers ) { - //System.out.printf("Merging in header %s%n", source); - for ( final VCFHeaderLine line : source.getMetaDataInSortedOrder()) { - - String key = line.getKey(); - if ( line instanceof VCFIDHeaderLine ) - key = key + "-" + ((VCFIDHeaderLine)line).getID(); - - if ( map.containsKey(key) ) { - final VCFHeaderLine other = map.get(key); - if ( line.equals(other) ) { - // continue; - } else if ( ! line.getClass().equals(other.getClass()) ) { - throw new IllegalStateException("Incompatible header types: " + line + " " + other ); - } else if ( line instanceof VCFFilterHeaderLine ) { - final String lineName = ((VCFFilterHeaderLine) line).getID(); - final String otherName = ((VCFFilterHeaderLine) other).getID(); - if ( ! lineName.equals(otherName) ) - throw new IllegalStateException("Incompatible header types: " + line + " " + other ); - } else if ( line instanceof VCFCompoundHeaderLine ) { - final VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line; - final VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other; - - // if the names are the same, but the values are different, we need to quit - if (! (compLine).equalsExcludingDescription(compOther) ) { - if ( compLine.getType().equals(compOther.getType()) ) { - // The Number entry is an Integer that describes the number of values that can be - // included with the INFO field. For example, if the INFO field contains a single - // number, then this value should be 1. However, if the INFO field describes a pair - // of numbers, then this value should be 2 and so on. If the number of possible - // values varies, is unknown, or is unbounded, then this value should be '.'. 
- conflictWarner.warn(line, "Promoting header field Number to . due to number differences in header lines: " + line + " " + other); - compOther.setNumberToUnbounded(); - } else if ( compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float ) { - // promote key to Float - conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); - map.put(key, compOther); - } else if ( compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer ) { - // promote key to Float - conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); - } else { - throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other ); - } - } - if ( ! compLine.getDescription().equals(compOther.getDescription()) ) - conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine); - } else { - // we are not equal, but we're not anything special either - conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other); - } - } else { - map.put(key, line); - //System.out.printf("Adding header line %s%n", line); - } - } - } - // returning a LinkedHashSet so that ordering will be preserved. Ensures the contig lines do not get scrambled. - return new LinkedHashSet(map.values()); + //TODO: Once we settle on the uses for this, we should determine how it gets set. For now its static/global. 
+ public static boolean VCF_STRICT_VERSION_VALIDATION = true; + public static boolean VCF_VERBOSE_LOGGING = true; + + public static boolean getStrictVCFVersionValidation() { return VCF_STRICT_VERSION_VALIDATION; } + public static boolean getVerboseVCFLogging() { return VCF_VERBOSE_LOGGING; } + + //TODO: NOTE: The old implementation of this code had side-effects due to mutation of some VCFCompoundHeaderLines + public static Set smartMergeHeaders( + final Collection headers, + final boolean emitWarnings) throws IllegalStateException { + return VCFHeader.getMergedHeaderLines(headers, emitWarnings); } /** @@ -129,7 +71,7 @@ public static Set withUpdatedContigsAsLines(final Set lines = new LinkedHashSet(oldLines.size()); for ( final VCFHeaderLine line : oldLines ) { - if ( line instanceof VCFContigHeaderLine ) + if ( line.isStructuredHeaderLine() && line.getKey().equals(VCFConstants.CONTIG_HEADER_KEY) ) continue; // skip old contig lines if ( line.getKey().equals(VCFHeader.REFERENCE_KEY) ) continue; // skip the old reference key @@ -161,19 +103,16 @@ public static Set withUpdatedContigsAsLines(final Set makeContigHeaderLines(final SAMSequenceDictionary refDict, final File referenceFile) { - final List lines = new ArrayList(); + final List lines = new ArrayList<>(); final String assembly = referenceFile != null ? 
getReferenceAssembly(referenceFile.getName()) : null; for ( final SAMSequenceRecord contig : refDict.getSequences() ) - lines.add(makeContigHeaderLine(contig, assembly)); + lines.add(new VCFContigHeaderLine(contig, assembly)); return lines; } + @Deprecated private static VCFContigHeaderLine makeContigHeaderLine(final SAMSequenceRecord contig, final String assembly) { - final Map map = new LinkedHashMap(3); - map.put("ID", contig.getSequenceName()); - map.put("length", String.valueOf(contig.getSequenceLength())); - if ( assembly != null ) map.put("assembly", assembly); - return new VCFContigHeaderLine(map, contig.getSequenceIndex()); + return new VCFContigHeaderLine(contig, assembly); } private static String getReferenceAssembly(final String refPath) { @@ -190,20 +129,4 @@ else if (refPath.contains("hg19")) return assembly; } - /** Only displays a warning if warnings are enabled and an identical warning hasn't been already issued */ - private static final class HeaderConflictWarner { - boolean emitWarnings; - Set alreadyIssued = new HashSet(); - - private HeaderConflictWarner( final boolean emitWarnings ) { - this.emitWarnings = emitWarnings; - } - - public void warn(final VCFHeaderLine line, final String msg) { - if ( GeneralUtils.DEBUG_MODE_ENABLED && emitWarnings && ! 
alreadyIssued.contains(line.getKey()) ) { - alreadyIssued.add(line.getKey()); - System.err.println(msg); - } - } - } } diff --git a/src/test/java/htsjdk/utils/UtilsUnitTest.java b/src/test/java/htsjdk/utils/UtilsUnitTest.java new file mode 100644 index 0000000000..bfedf0153a --- /dev/null +++ b/src/test/java/htsjdk/utils/UtilsUnitTest.java @@ -0,0 +1,30 @@ +package htsjdk.utils; + +import org.testng.Assert; +import org.testng.annotations.Test; + +public class UtilsUnitTest { + @Test(expectedExceptions = IllegalArgumentException.class) + public void testNonNullThrows(){ + final Object o = null; + Utils.nonNull(o); + } + + @Test + public void testNonNullDoesNotThrow(){ + final Object o = new Object(); + Assert.assertSame(Utils.nonNull(o), o); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "^The exception message$") + public void testNonNullWithMessageThrows() { + Utils.nonNull(null, "The exception message"); + } + + @Test + public void testNonNullWithMessageReturn() { + final Object testObject = new Object(); + Assert.assertSame(Utils.nonNull(testObject, "some message"), testObject); + } + +} diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java index 91804c48dc..1113d6edf5 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java @@ -34,9 +34,8 @@ import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFIDHeaderLine; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import htsjdk.variant.vcf.VCFSimpleHeaderLine; +import htsjdk.variant.vcf.VCFStructuredHeaderLine; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -44,9 +43,12 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; 
+import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; +import static htsjdk.variant.vcf.VCFStructuredHeaderLine.DESCRIPTION_ATTRIBUTE; + /** * Tests for BCF2Utils */ @@ -79,18 +81,17 @@ public void testCreateDictionary() { inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFInfoHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFInfoHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); inputLines.add(new VCFHeaderLine("x", "misc")); inputLines.add(new VCFHeaderLine("y", "misc")); - inputLines.add(new VCFSimpleHeaderLine("GATKCommandLine","z","misc")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - final int inputLineCounter = counter; - final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet(inputLines)); + inputLines.add(new VCFFilterHeaderLine("aFilter", "misc")); + inputLines.add(new VCFFormatHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFFormatHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + final VCFHeader inputHeader = new 
VCFHeader(new LinkedHashSet<>(inputLines)); final ArrayList dict = BCF2Utils.makeDictionary(inputHeader); final int dict_size = dict.size(); - Assert.assertEquals(7,dict_size); + Assert.assertEquals(8,dict_size); } /** @@ -179,8 +180,8 @@ public Object[][] makeHeaderOrderTestProvider() { private static boolean expectedConsistent(final VCFHeader combinationHeader, final int minCounterForInputLines) { final List ids = new ArrayList(); for ( final VCFHeaderLine line : combinationHeader.getMetaDataInInputOrder() ) { - if ( line instanceof VCFIDHeaderLine) { - ids.add(Integer.valueOf(((VCFIDHeaderLine) line).getID())); + if ( line.isStructuredHeaderLine()) { + ids.add(Integer.valueOf(line.getID())); } } diff --git a/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java b/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java index eaf2f95a10..dd878becc9 100644 --- a/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java @@ -38,11 +38,7 @@ import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; -import htsjdk.variant.vcf.VCFCodec; -import htsjdk.variant.vcf.VCFFileReader; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; -import htsjdk.variant.vcf.VCFHeaderVersion; +import htsjdk.variant.vcf.*; import java.io.File; import java.io.FileNotFoundException; @@ -243,6 +239,7 @@ public void TestWritingLargeVCF(final String extension) throws FileNotFoundExcep @DataProvider(name = "vcfExtensionsDataProvider") public Object[][]vcfExtensionsDataProvider() { return new Object[][] { + //TODO: fix this BCF problem! // TODO: BCF doesn't work because header is not properly constructed. 
// {".bcf"}, {".vcf"}, diff --git a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java index 9f81547ed2..72ed0e0664 100644 --- a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java +++ b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java @@ -1,5 +1,7 @@ package htsjdk.variant.vcf; +import htsjdk.tribble.AbstractFeatureReader; +import htsjdk.tribble.FeatureCodec; import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.tabix.TabixFormat; import htsjdk.variant.VariantBaseTest; @@ -10,8 +12,8 @@ import org.testng.annotations.Test; import java.io.File; -import java.util.List; - +import java.io.IOException; +import java.util.Iterator; public class AbstractVCFCodecTest extends VariantBaseTest { @@ -27,14 +29,24 @@ public void shouldPreserveSymbolicAlleleCase() { Assert.assertTrue(variant.getAlternateAllele(0).getDisplayString().contains("chr12")); } - @Test - public void TestSpanDelParseAlleles(){ - List list = VCF3Codec.parseAlleles("A", Allele.SPAN_DEL_STRING, 0); + @DataProvider(name="AllVCFCodecs") + public Object[][] allVCFCodecVersions() { + return new Object[][] { + {new VCF3Codec() }, + {new VCFCodec() }, + //{new VCF43Codec ()} + }; + } + + @Test(dataProvider = "AllVCFCodecs") + public void TestSpanDelParseAlleles(final AbstractVCFCodec vcfCodec){ + // TODO: why is there no Assert here ?? 
+ vcfCodec.parseAlleles("A", Allele.SPAN_DEL_STRING, 0); } - @Test(expectedExceptions = TribbleException.class) - public void TestSpanDelParseAllelesException(){ - List list1 = VCF3Codec.parseAlleles(Allele.SPAN_DEL_STRING, "A", 0); + @Test(dataProvider = "AllVCFCodecs", expectedExceptions = TribbleException.class) + public void TestSpanDelParseAllelesException(final AbstractVCFCodec vcfCodec){ + vcfCodec.parseAlleles(Allele.SPAN_DEL_STRING, "A", 0); } @DataProvider(name="thingsToTryToDecode") @@ -49,12 +61,41 @@ public Object[][] getThingsToTryToDecode(){ @Test(dataProvider = "thingsToTryToDecode") public void testCanDecodeFile(String potentialInput, boolean canDecode) { + //TODO: add VCF43Codec when available + //TODO: it's not sufficient to test for ANY v4 prefix since it will succeed on 4.3 as well Assert.assertEquals(AbstractVCFCodec.canDecodeFile(potentialInput, VCFCodec.VCF4_MAGIC_HEADER), canDecode); } - @Test - public void testGetTabixFormat() { - Assert.assertEquals(new VCFCodec().getTabixFormat(), TabixFormat.VCF); - Assert.assertEquals(new VCF3Codec().getTabixFormat(), TabixFormat.VCF); + @Test(dataProvider = "AllVCFCodecs") + public void testGetTabixFormat(final AbstractVCFCodec vcfCodec) { + Assert.assertEquals(vcfCodec.getTabixFormat(), TabixFormat.VCF); + } + + @DataProvider(name="otherHeaderLines") + public Object[][] otherHeaderLines() { + return new Object[][] { + { "key=<", new VCFHeaderLine("key", "<") }, + }; + } + + @Test(dataProvider="otherHeaderLines") + public void testGetOtherHeaderLine(final String headerLineString, final VCFHeaderLine headerLine) { + Assert.assertEquals(new VCFCodec().getOtherHeaderLine(headerLineString, VCFHeaderVersion.VCF4_2), headerLine); + } + + @DataProvider(name="badOtherHeaderLines") + public Object[][] badOtherHeaderLines() { + return new Object[][] { + { "=" }, + { "=<" }, + { "=<>" }, + { "key" }, + }; } + + @Test(dataProvider="badOtherHeaderLines", expectedExceptions=TribbleException.InvalidHeader.class) + 
public void testBadOtherHeaderLine(final String headerLineString) { + Assert.assertNull(new VCFCodec().getOtherHeaderLine(headerLineString, VCFHeaderVersion.VCF4_2)); + } + } diff --git a/src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java new file mode 100644 index 0000000000..b8c6958125 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java @@ -0,0 +1,42 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFAltHeaderLineUnitTest { + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + }; + } + + private static String altString = ""; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + VCFAltHeaderLine vcfLine = new VCFAltHeaderLine(altString, vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFStructuredHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFAltHeaderLine(altString, vcfAllowedVersion); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java index fe19fc5c06..a27388daa4 100644 --- 
a/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java @@ -25,22 +25,221 @@ package htsjdk.variant.vcf; +import htsjdk.tribble.TribbleException; import htsjdk.variant.VariantBaseTest; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; /** - * User: ebanks - * Date: Apr 2, 2014 + * Tests for VCFCompoundHeaderLine. + * + * NOTE: This class uses VCFInfoHeaderLine instances to test shared VCFCompoundHeaderLine functionality since + * VCFCompoundHeaderLine is abstract. */ public class VCFCompoundHeaderLineUnitTest extends VariantBaseTest { + @DataProvider (name = "badOrMissingAttributes") + public Object[][] getMissingAttributes() { + return new Object[][] { + {""}, // no Type + {""}, // no Number + {""}, // bogus Type + {""}, // bogus Number + }; + } + + @Test(dataProvider= "badOrMissingAttributes", expectedExceptions=TribbleException.class) + public void testBadOrMissingAttributes(final String lineString) { + new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + } + + @DataProvider (name = "acceptedAttributes") + public Object[][] getAcceptedAttributes() { + return new Object[][] { + {"", "Description", "foo"}, + //next two cases from https://github.com/samtools/htsjdk/issues/517 + {"", "Version", "3"}, + {"", "Source", "mySource"}, + }; + } + + @Test(dataProvider= "acceptedAttributes") + public void testAcceptedAttributes(final String lineString, final String attribute, final String expectedValue) { + final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(headerline.getGenericFieldValue(attribute), expectedValue); + } + + @DataProvider (name = "headerLineTypes") + public Object[][] getHeaderLineTypes() { + return new Object[][] { + {"", 
VCFHeaderLineType.Float}, + {"", VCFHeaderLineType.Integer}, + {"", VCFHeaderLineType.String}, + {"", VCFHeaderLineType.Character}, + // Number must be 0 for flag type + {"", VCFHeaderLineType.Flag}, + }; + } + + @Test(dataProvider = "headerLineTypes") + public void testGetType(final String lineString, final VCFHeaderLineType expectedType) { + final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(headerline.getType(), expectedType); + } + + @DataProvider (name = "headerLineCountTypes") + public Object[][] getLineCountTypes() { + return new Object[][] { + {"", VCFHeaderLineCount.A}, + {"", VCFHeaderLineCount.R}, + {"", VCFHeaderLineCount.G}, + {"", VCFHeaderLineCount.INTEGER}, + {"", VCFHeaderLineCount.UNBOUNDED}, + }; + } + + @Test(dataProvider= "headerLineCountTypes") + public void testGetLineCountType(final String lineString, final VCFHeaderLineCount expectedCountType) { + final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(headerline.getCountType(), expectedCountType); + Assert.assertEquals(headerline.isFixedCount(), expectedCountType == VCFHeaderLineCount.INTEGER); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectIntegerTypeWithNegativeCount() { + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectFlagTypeWithNegativeCount() { + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + } + + @DataProvider (name = "equalsData") + public Object[][] getEqualsData() { + return new Object[][] { + //pos + {"", + "", true}, + {"", + "", true}, + {"", + "", true}, + {"", + "", true}, + {"", + "", true}, + + //neg + {"", + "", false}, // different ID + {"", + "", false}, // different Type + {"", + "", false}, // different Number + {"", + "", false}, // different integer Number + {"", + "", false}, // 
different description + {"", + "", + "", + "" // merged result, promote to float + }, + { + "", + "", + "" // merged result, promote to float + }, + { + "", + "", + "" // merged result, resolve as new unbounded + }, + }; + } + + @Test(dataProvider = "compatibleMergerData") + public void testMergeCompatibleCompoundHeaderLines(final String line1, final String line2, final String expectedLine) { + VCFCompoundHeaderLine mergedLine = VCFCompoundHeaderLine.getSmartMergedCompoundHeaderLine( + new VCFInfoHeaderLine(line1, VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine(line2, VCFHeader.DEFAULT_VCF_VERSION), + new VCFHeader.HeaderConflictWarner(false), + (l1, l2) -> new VCFInfoHeaderLine( + l1.getID(), + VCFHeaderLineCount.UNBOUNDED, + l1.getType(), + l1.getDescription()) + ); + Assert.assertEquals(mergedLine, new VCFInfoHeaderLine(expectedLine, VCFHeader.DEFAULT_VCF_VERSION)); + } + + @DataProvider(name = "incompatibleMergerData") + public Object[][] getIncompatibleMergerData() { + return new Object[][]{ + { + "", + "" + }, + { + "", + "" + }, + }; + } + + @Test(dataProvider = "incompatibleMergerData", expectedExceptions=IllegalStateException.class) + public void testMergeIncompatibleCompoundHeaderLines(final String line1, final String line2) { + VCFCompoundHeaderLine.getSmartMergedCompoundHeaderLine( + new VCFInfoHeaderLine(line1, VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine(line2, VCFHeader.DEFAULT_VCF_VERSION), + new VCFHeader.HeaderConflictWarner(false), + (l1, l2) -> { throw new IllegalArgumentException("lambda should never execute - this exception should never be thrown"); } + ); + } + + @Test + public void testEncodeWithUnescapedQuotes() { + + VCFFilterHeaderLine unescapedFilterLine = new VCFFilterHeaderLine( + "aFilter", + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); + + final String encodedAttributes = unescapedFilterLine.toStringEncoding(); + assertNotNull(encodedAttributes); + + final String 
expectedEncoding = "FILTER="; + assertEquals(encodedAttributes, expectedEncoding); + } + @Test - public void supportsVersionFields() { - final String line = ""; - final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(line, VCFHeaderVersion.VCF4_2); - // if we don't support version fields then we should fail before we ever get here - Assert.assertTrue(true); + public void testEncodeWithEscapedQuotes() { + + VCFFilterHeaderLine escapedFilterLine = new VCFFilterHeaderLine("aFilter", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + final String encodedAttributes = escapedFilterLine.toStringEncoding(); + assertNotNull(encodedAttributes); + + final String expectedEncoding = "FILTER="; + assertEquals(encodedAttributes, expectedEncoding); } + } diff --git a/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java new file mode 100644 index 0000000000..8ada5cab4b --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java @@ -0,0 +1,164 @@ +package htsjdk.variant.vcf; + +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.TreeSet; + +public class VCFContigHeaderLineUnitTest { + + @DataProvider(name = "allowedIDs") + public Object[][] getAllowedIDs() { + return new Object[][]{ + {"", "1"}, + {"", "10"}, + {"", "X"}, + {"", "Y"}, + {"", "MT"}, + {"", "NC_007605"}, + {"", "GL000191.1"}, + {"", "HLA-A*01:01:01:01"}, //https://github.com/samtools/hts-specs/issues/124 + }; + } + + @Test(dataProvider= "allowedIDs") + public void testAllowedIDs(final String lineString, final String expectedIDString) { + final VCFContigHeaderLine headerline = new VCFContigHeaderLine(lineString, 
VCFHeader.DEFAULT_VCF_VERSION, 0); + Assert.assertEquals(headerline.getID(), expectedIDString); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectNegativeIndex() { + new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, -1); + } + + @DataProvider(name = "allowedAttributes") + public Object[][] getAllowedAttributes() { + return new Object[][] { + {"", "ID", "contig1"}, // https://github.com/samtools/htsjdk/issues/389 (no length) + {"", "length", "100"}, + {"", "taxonomy", "Homo sapiens"}, + {"", "assembly", "b37"}, + {"", "md5", "1a258fe76dfc8abd926f81f0e9b82ed7"}, + {"", + "URL", "http://www.refserve.org:8080/path/"}, + {"", + "species", "Homo sapiens"}, + }; + } + + @Test(dataProvider= "allowedAttributes") + public void testAllowedAttributes(final String lineString, final String attribute, final String expectedValue) { + final VCFContigHeaderLine headerline = new VCFContigHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION, 0); + Assert.assertEquals(headerline.getGenericFieldValue(attribute), expectedValue); + } + + @Test + public void testRoundTripThroughSequenceRecord() { + final VCFContigHeaderLine contigLine = new VCFContigHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION, + 0); + + String lengthString = "100"; + String assemblyString = "b37"; + String md5String = "1a258fe76dfc8abd926f81f0e9b82ed7"; + String URLString = "http://www.refserve.org:8080/path/"; + String speciesString = "Homo sapiens"; + + SAMSequenceRecord sequenceRecord = contigLine.getSAMSequenceRecord(); + + Assert.assertEquals(Integer.toString(sequenceRecord.getSequenceLength()), lengthString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.LENGTH_ATTRIBUTE), lengthString); + + Assert.assertEquals(sequenceRecord.getAssembly(), assemblyString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.ASSEMBLY_ATTRIBUTE), assemblyString); + + Assert.assertEquals(sequenceRecord.getMd5(), md5String); + 
Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.MD5_ATTRIBUTE), md5String); + + Assert.assertEquals(sequenceRecord.getAttribute(SAMSequenceRecord.URI_TAG), URLString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.URL_ATTRIBUTE), URLString); + + Assert.assertEquals(sequenceRecord.getAttribute(SAMSequenceRecord.SPECIES_TAG), speciesString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.SPECIES_ATTRIBUTE), speciesString); + + // now turn the SAMSequenceRecord back into a contig line, and compare the result to the + // original contig line + Assert.assertEquals( + new VCFContigHeaderLine(sequenceRecord, assemblyString), + contigLine); + } + + @DataProvider (name = "hashEqualsCompareData") + public Object[][] getHashEqualsCompareData() { + return new Object[][] { + + // For contig lines, equals and hash depend on the id, all other attributes, and the contig index, + // but compareTo only cares about the index. + + // line, index, line, line, index -> expected hash equals, expected equals, expected compare, + {"", 0, "", 0, true, true, 0 }, // identical + {"", 0, "", 1, false, false, -1 }, // identical except contig index + {"", 1, "", 0, false, false, 1 }, // identical except contig index + + {"", 0, "", 0, false, false, 0 }, // identical except attributes + {"", 0, "", 1, false, false, -1 }, // different attributes, different index + + {"", 0, "", 0, false, false, 0 }, // identical except ID + // different ID, same attributes and index, -> not equal, different hash, compare==0 + {"", 0, "", 0, false, false, 0 }, // different ID, attributes, same index + }; + } + + @Test(dataProvider = "hashEqualsCompareData") + public void testHashEqualsCompare( + final String line1, + final int index1, + final String line2, + final int index2, + final boolean expectedHashEquals, + final boolean expectedEquals, + final int expectedCompare) + { + final VCFContigHeaderLine headerLine1 = new 
VCFContigHeaderLine(line1, VCFHeader.DEFAULT_VCF_VERSION, index1); + final VCFContigHeaderLine headerLine2 = new VCFContigHeaderLine(line2, VCFHeader.DEFAULT_VCF_VERSION, index2); + + Assert.assertEquals(headerLine1.hashCode() == headerLine2.hashCode(), expectedHashEquals); + Assert.assertEquals(headerLine1.equals(headerLine2), expectedEquals); + Assert.assertEquals(headerLine1.compareTo(headerLine2), expectedCompare); + } + + @Test + public void testSortOrder() { + + final List expectedLineOrder = new ArrayList() {{ + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 1)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 2)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 10)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 20)); + }}; + + final TreeSet sortedLines = new TreeSet<>( + new ArrayList() {{ + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 20)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 10)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 1)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 2)); + }} + ); + + final Iterator sortedIt = sortedLines.iterator(); + for (VCFContigHeaderLine cl : expectedLineOrder) { + Assert.assertTrue(sortedIt.hasNext()); + Assert.assertEquals(cl, sortedIt.next()); + } + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java new file mode 100644 index 0000000000..25b6115bb6 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java @@ -0,0 +1,17 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; +import org.testng.annotations.Test; + +/** + * Test conditions that are unique to FORMAT lines (not covered by VCFCompoundHeaderLineUnitTest). 
+ */ +public class VCFFormatHeaderLineUnitTest { + + // FORMAT lines aren't allowed to have type==Flag + @Test(expectedExceptions=TribbleException.class) + public void testRejectInfoLineWithFlagField() { + new VCFFormatHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java index 02cde53109..5d972c0176 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java @@ -1,43 +1,110 @@ package htsjdk.variant.vcf; +import htsjdk.tribble.TribbleException; import htsjdk.variant.VariantBaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.LinkedHashMap; -import java.util.Map; - -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertNotNull; public class VCFHeaderLineUnitTest extends VariantBaseTest { @Test - public void testEncodeVCFHeaderLineWithUnescapedQuotes() { + public void testIsNotStructuredHeaderLine() { + VCFHeaderLine hl = new VCFHeaderLine("key", "value"); + Assert.assertFalse(hl.isStructuredHeaderLine()); + Assert.assertNull(hl.getID()); + } - final Map attributes = new LinkedHashMap<>(); - attributes.put("ID", "VariantFiltration"); - attributes.put("CommandLineOptions", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); + @Test + public void testStringEncoding() { + VCFHeaderLine hl = new VCFHeaderLine("key", "value"); + Assert.assertEquals(hl.toStringEncoding(), "key=value"); + } - final String encodedAttributes = VCFHeaderLine.toStringEncoding(attributes); - assertNotNull(encodedAttributes); + @DataProvider(name = "headerLineEquals") + public Object[][] headerLineEquals() { + return new Object[][]{ + { + new VCFHeaderLine("key", "value"), + new VCFHeaderLine("key", "value"), + true + }, + { + new 
VCFHeaderLine("key", "value1"), + new VCFHeaderLine("key", "value2"), + false + }, + { + new VCFHeaderLine("key1", "value"), + new VCFHeaderLine("key2", "value"), + false + }, + { + new VCFHeaderLine("key1", "value1"), + new VCFHeaderLine("key2", "value2"), + false + } + }; + } - final String expectedEncoding = ""; - assertEquals(encodedAttributes, expectedEncoding); + @Test(dataProvider = "headerLineEquals") + public void testEquals(final VCFHeaderLine hl1, final VCFHeaderLine hl2, final boolean expectedEquals) { + Assert.assertEquals(hl1.equals(hl2), expectedEquals); } + @DataProvider(name = "invalidHeaderLineKeys") + public Object[][] invalidHeaderLineKeys() { + return new Object[][]{ + {null}, + {"embedded<"}, + {"embedded="}}; + } - @Test - public void testEncodeVCFHeaderLineWithEscapedQuotes() { + @Test(dataProvider = "invalidHeaderLineKeys", expectedExceptions=TribbleException.class) + public void testInvalidKeys(final String testKey) { + new VCFHeaderLine(testKey, ""); + } + + @Test(dataProvider = "invalidHeaderLineKeys", expectedExceptions=TribbleException.class) + public void testValidateAsIdInvalid(final String testKey) { + VCFHeaderLine.validateAsID(testKey, "test"); + } - final Map attributes = new LinkedHashMap<>(); - attributes.put("ID", "VariantFiltration"); - attributes.put("CommandLineOptions", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + @DataProvider(name = "vcfVersions") + public Object[][] vcfVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } - final String encodedAttributes = VCFHeaderLine.toStringEncoding(attributes); - assertNotNull(encodedAttributes); + @Test(dataProvider = "vcfVersions") + public void testValidateForVersion(final VCFHeaderVersion vcfVersion) { + VCFHeaderLine headerLine = new 
VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString()); + headerLine.validateForVersion(vcfVersion); + } - final String expectedEncoding = ""; - assertEquals(encodedAttributes, expectedEncoding); + @DataProvider(name = "incompatibleVersions") + public Object[][] incompatibleVersionPairs() { + return new Object[][]{ + // each pair just has to be different + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2} + }; } + @Test(dataProvider="incompatibleVersions", expectedExceptions=TribbleException.class) + public void testValidateForVersionFails(final VCFHeaderVersion vcfVersion, final VCFHeaderVersion incompatibleVersion) { + VCFHeaderLine headerLine = new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString()); + headerLine.validateForVersion(incompatibleVersion); + } } diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java index e9135cc723..beb9ec48f9 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java @@ -25,6 +25,8 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.TestUtil; import htsjdk.tribble.TribbleException; @@ -37,24 +39,13 @@ import org.testng.Assert; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; 
-import java.io.PrintWriter; -import java.io.StringReader; -import java.math.BigInteger; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; +import java.io.*; +import java.util.*; + +import static htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder.NO_OPTIONS; /** * Created by IntelliJ IDEA. @@ -71,10 +62,27 @@ private VCFHeader createHeader(String headerStr) { VCFCodec codec = new VCFCodec(); VCFHeader header = (VCFHeader) codec.readActualHeader(new LineIteratorImpl(new SynchronousLineReader( new StringReader(headerStr)))); + //the "#CHROM..." header line isn't returned by getMetaDataInInputOrder Assert.assertEquals(header.getMetaDataInInputOrder().size(), VCF4headerStringCount); return header; } + private Set getV42HeaderLinesWithNoFormatString() { + // precondition - create a v42 header and make sure its v42 + VCFHeader header = createHeader(VCF42headerStrings); + Set metaDataSet = new LinkedHashSet<>(header.getMetaDataInInputOrder()); + VCFHeaderLine versionLine = VCFHeader.getVersionLineFromHeaderLineSet(metaDataSet); + // precondition - make sure its v42 to start with + Assert.assertEquals( + VCFHeaderVersion.toHeaderVersion(versionLine.getValue()), + VCFHeaderVersion.VCF4_2); + + // remove the 4.2 version line from the original set, verify, and return the set with no fileformat string + metaDataSet.remove(versionLine); + Assert.assertNull(VCFHeader.getVersionLineFromHeaderLineSet(metaDataSet)); + return metaDataSet; + } + @BeforeClass private void createTemporaryDirectory() { tempDir = TestUtil.getTempDirectory("VCFHeader", "VCFHeaderTest"); @@ -89,15 +97,17 @@ private void deleteTemporaryDirectory() { } @Test - public void testVCF4ToVCF4() { - VCFHeader header = createHeader(VCF4headerStrings); - checkMD5ofHeaderFile(header, 
"91c33dadb92e01ea349bd4bcdd02d6be"); + public void testVCF4ToVCF4() throws IOException { + VCFHeader header = createHeader(VCF42headerStrings); + Set roundTripped = getRoundTripEncoded(header); + Assert.assertTrue(roundTripped.equals(header.getMetaDataInSortedOrder())); } @Test - public void testVCF4ToVCF4_alternate() { + public void testVCF4ToVCF4_alternate() throws IOException { VCFHeader header = createHeader(VCF4headerStrings_with_negativeOne); - checkMD5ofHeaderFile(header, "39318d9713897d55be5ee32a2119853f"); + Set roundTripped = getRoundTripEncoded(header); + Assert.assertTrue(roundTripped.equals(header.getMetaDataInSortedOrder())); } @Test @@ -131,7 +141,10 @@ public void testVCFHeaderDictionaryMerging() { headerOne.getSequenceDictionary().assertSameDictionary(headerTwo.getSequenceDictionary()); // Run the merge command - final VCFHeader mergedHeader = new VCFHeader(VCFUtils.smartMergeHeaders(Arrays.asList(headerOne, headerTwo), false), sampleList); + final VCFHeader mergedHeader = new VCFHeader( + VCFUtils.smartMergeHeaders(Arrays.asList(headerOne, headerTwo),false), + sampleList + ); // Check that the mergedHeader's sequence dictionary matches the first two mergedHeader.getSequenceDictionary().assertSameDictionary(headerOne.getSequenceDictionary()); @@ -197,8 +210,8 @@ public void testVCFHeaderAddFormatLine() { public void testVCFHeaderAddFilterLine() { final VCFHeader header = getHiSeqVCFHeader(); final String filterDesc = "TestFilterLine Description"; - final VCFFilterHeaderLine filterLine = new VCFFilterHeaderLine("TestFilterLine",filterDesc); - Assert.assertEquals(filterDesc,filterLine.getDescription()); + final VCFFilterHeaderLine filterLine = new VCFFilterHeaderLine("TestFilterLine", filterDesc); + Assert.assertEquals(filterDesc, filterLine.getDescription()); header.addMetaDataLine(filterLine); Assert.assertTrue(header.getFilterLines().contains(filterLine), "TestFilterLine not found in filter header lines"); @@ -214,7 +227,11 @@ public void 
testVCFHeaderAddFilterLine() { @Test public void testVCFHeaderAddContigLine() { final VCFHeader header = getHiSeqVCFHeader(); - final VCFContigHeaderLine contigLine = new VCFContigHeaderLine("", VCFHeaderVersion.VCF4_0, "chr1", 0); + //TODO: Note: This test was previously adding a "contig" header line with key "chr1", which + //would roundtrip (through a file) as a VCFHeaderLine, not a VCFContigHeaderLine + final VCFContigHeaderLine contigLine = new VCFContigHeaderLine("", VCFHeaderVersion.VCF4_0, 0); + Assert.assertEquals(contigLine.getKey(), VCFHeader.CONTIG_KEY); + Assert.assertEquals(contigLine.getID(), "chr1"); header.addMetaDataLine(contigLine); Assert.assertTrue(header.getContigLines().contains(contigLine), "Test contig line not found in contig header lines"); @@ -226,6 +243,22 @@ public void testVCFHeaderAddContigLine() { Assert.assertFalse(header.getOtherHeaderLines().contains(contigLine), "Test contig line present in other header lines"); } + + //TODO: This is a new test, but it passes in both the old and new implementations ? Should this be allowed ? 
+ // It seems wrong, VCFHeader allows two contig lines with the same contig index to reside in the header + @Test + public void testVCFHeaderAddContigLineWithSameIndex() { + final VCFHeader header = new VCFHeader(); + final VCFContigHeaderLine contigLine1 = new VCFContigHeaderLine("", VCFHeaderVersion.VCF4_2, 0); + final VCFContigHeaderLine contigLine2 = new VCFContigHeaderLine("", VCFHeaderVersion.VCF4_2, 0); + + header.addMetaDataLine(contigLine1); + header.addMetaDataLine(contigLine2); + + Assert.assertTrue(header.getContigLines().contains(contigLine1)); + Assert.assertTrue(header.getContigLines().contains(contigLine2)); + } + @Test public void testVCFHeaderAddOtherLine() { final VCFHeader header = getHiSeqVCFHeader(); @@ -272,7 +305,7 @@ public void testVCFHeaderAddDuplicateContigLine() { final int numContigLinesBefore = header.getContigLines().size(); - // try to readd the first contig line + // try to re-add the first contig line header.addMetaDataLine(header.getContigLines().get(0)); final int numContigLinesAfter = header.getContigLines().size(); @@ -281,7 +314,7 @@ public void testVCFHeaderAddDuplicateContigLine() { } @Test - public void testVCFHeaderAddDuplicateHeaderLine() { + public void testVCFHeaderAddDuplicateKeyValueHeaderLine() { File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); VCFFileReader reader = new VCFFileReader(input, false); @@ -292,11 +325,13 @@ public void testVCFHeaderAddDuplicateHeaderLine() { header.addMetaDataLine(newHeaderLine); final int numHeaderLinesBefore = header.getOtherHeaderLines().size(); - // readd the same header line + // add the same header line again header.addMetaDataLine(newHeaderLine); final int numHeaderLinesAfter = header.getOtherHeaderLines().size(); - // assert that we have the same number of other header lines before and after + // TODO: Note: This change assumes we don't allow duplicate unstructured + // lines with the same key unless they have different content + // assert that we have the one 
more other header line after Assert.assertEquals(numHeaderLinesBefore, numHeaderLinesAfter); } @@ -312,7 +347,7 @@ public void testVCFHeaderSerialization() throws Exception { Assert.assertEquals(deserializedHeader.getContigLines(), originalHeader.getContigLines(), "Contig header lines do not match before/after serialization"); Assert.assertEquals(deserializedHeader.getFilterLines(), originalHeader.getFilterLines(), "Filter header lines do not match before/after serialization"); Assert.assertEquals(deserializedHeader.getFormatHeaderLines(), originalHeader.getFormatHeaderLines(), "Format header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getIDHeaderLines(), originalHeader.getIDHeaderLines(), "ID header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getStructuredHeaderLines(), originalHeader.getStructuredHeaderLines(), "ID header lines do not match before/after serialization"); Assert.assertEquals(deserializedHeader.getInfoHeaderLines(), originalHeader.getInfoHeaderLines(), "Info header lines do not match before/after serialization"); Assert.assertEquals(deserializedHeader.getOtherHeaderLines(), originalHeader.getOtherHeaderLines(), "Other header lines do not match before/after serialization"); Assert.assertEquals(deserializedHeader.getGenotypeSamples(), originalHeader.getGenotypeSamples(), "Genotype samples not the same before/after serialization"); @@ -322,6 +357,309 @@ public void testVCFHeaderSerialization() throws Exception { Assert.assertEquals(deserializedHeader.toString(), originalHeader.toString(), "String representation of header not the same before/after serialization"); } + @DataProvider(name = "vcfVersions") + public Object[][] vcfVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + 
@Test(dataProvider = "vcfVersions") + public void testCreateHeaderWithNoFileFormatLine(final VCFHeaderVersion vcfVersion) { + Set metaDataSet = getV42HeaderLinesWithNoFormatString(); // this (4.2) header is compatible with all 4.x versions + + // create a new versioned header from this set (containing no fileformat line) + // which should always take on the version requested in the constructor + VCFHeader vcfHeader = new VCFHeader(vcfVersion, metaDataSet, Collections.EMPTY_SET); + Assert.assertEquals(vcfHeader.getHeaderVersion(), vcfVersion); + } + + @Test(dataProvider = "vcfVersions") + public void testCreateHeaderWithMatchingFileFormatLine(final VCFHeaderVersion vcfVersion) { + Set metaDataSet = getV42HeaderLinesWithNoFormatString(); // 4.2 header is compatible with all 4.x versions + + // add in the corresponding fileformat line; create a new versioned header + // since the version requested in the constructor and the format lines are in sync, there is + // no conflict, and the resulting header's version should always match the requested version + metaDataSet.add(new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString())); + VCFHeader vcfHeader = new VCFHeader(vcfVersion, metaDataSet, Collections.EMPTY_SET); + Assert.assertEquals(vcfHeader.getHeaderVersion(), vcfVersion); + } + + @Test(expectedExceptions = TribbleException.class) + public void testCreateHeaderWithMultipleFileFormatLines() { + Set metaDataSet = getV42HeaderLinesWithNoFormatString(); // this (4.2) header is compatible with all 4.x versions + int beforeSize = metaDataSet.size(); + + metaDataSet.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); + metaDataSet.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_1.getFormatString(), VCFHeaderVersion.VCF4_1.getVersionString())); + Assert.assertEquals(metaDataSet.size(), beforeSize + 2); + + // try to create a new header from this set (which now contains two different fileformat lines); + // this test expects the attempt to fail with a TribbleException + new 
VCFHeader(metaDataSet, Collections.EMPTY_SET); + } + + @Test(dataProvider = "vcfVersions") + public void testSetHeaderVersionWithFileFormatLine(final VCFHeaderVersion vcfVersion) { + Set metaDataSet = getV42HeaderLinesWithNoFormatString(); // 4.2 header is compatible with all 4.x versions + + // don't request a version; let the header derive it from the embedded format line; + // the resulting header version should match the format line we embedded + metaDataSet.add(new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString())); + VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.EMPTY_SET); //defaults to v4.2 + Assert.assertEquals(vcfHeader.getHeaderVersion(), vcfVersion); + vcfHeader.setHeaderVersion(vcfVersion); + } + + @Test(dataProvider = "vcfVersions") + public void testSetHeaderVersionWithNoFileFormatLine(final VCFHeaderVersion vcfVersion) { + Set metaDataSet = getV42HeaderLinesWithNoFormatString(); // 4.2 header is compatible with all 4.x versions + + // create a new header from this set (containing no fileformat line), no requested version in constructor + VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.EMPTY_SET); //defaults to v4.2 + vcfHeader.setHeaderVersion(vcfVersion); + Assert.assertEquals(vcfHeader.getHeaderVersion(), VCFHeader.DEFAULT_VCF_VERSION); + } + + @DataProvider(name = "conflictingHeaderVersionPairs") + public Object[][] vcfConflictingVersionLines() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3}, + + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_1, 
VCFHeaderVersion.VCF4_3}, + + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3}, + + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2}, + }; + } + + @Test(dataProvider = "conflictingHeaderVersionPairs", expectedExceptions = IllegalArgumentException.class) + public void testCreateHeaderWithConflictingFileFormatLine( + final VCFHeaderVersion vcfVersion, + final VCFHeaderVersion conflictingVersion) { + Set metaDataSet = getV42HeaderLinesWithNoFormatString(); // 4.2 header is compatible with all 4.x versions + + //add in the fileformat line; create a new header requesting conflicting version + metaDataSet.add(new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString())); + new VCFHeader(conflictingVersion, metaDataSet, Collections.EMPTY_SET); + } + + @Test(dataProvider = "conflictingHeaderVersionPairs", expectedExceptions = TribbleException.class) + public void testSetHeaderWithConflictingVersion(final VCFHeaderVersion vcfVersion, final VCFHeaderVersion conflictingVersion) { + Set metaDataSet = getV42HeaderLinesWithNoFormatString(); // 4.2 header is compatible with all 4.x versions + + //add in a fileformat line; create a new header; setHeader with a conflicting version + metaDataSet.add(new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString())); + VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.EMPTY_SET); + vcfHeader.setHeaderVersion(conflictingVersion); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddSecondFileFormatLine() { + Set metaDataSet = 
getV42HeaderLinesWithNoFormatString(); // 4.2 header is compatible with all 4.x versions + + //add in a fileformat line that matches the default version; create a new header + metaDataSet.add(new VCFHeaderLine(VCFHeader.DEFAULT_VCF_VERSION.getFormatString(), VCFHeader.DEFAULT_VCF_VERSION.getVersionString())); + VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.EMPTY_SET); + Assert.assertEquals(vcfHeader.getHeaderVersion(), VCFHeader.DEFAULT_VCF_VERSION); + + // try to add another identical fileformat header line + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeader.DEFAULT_VCF_VERSION.getVersionString())); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddConflictingFileFormatLine() { + Set metaDataSet = getV42HeaderLinesWithNoFormatString(); // 4.2 header is compatible with all 4.x versions + + //add in a fileformat line that matches the default version; create a new header + metaDataSet.add(new VCFHeaderLine(VCFHeader.DEFAULT_VCF_VERSION.getFormatString(), VCFHeader.DEFAULT_VCF_VERSION.getVersionString())); + VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.EMPTY_SET); + Assert.assertEquals(vcfHeader.getHeaderVersion(), VCFHeader.DEFAULT_VCF_VERSION); + + // now add a conflicting fileformat header line + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeaderVersion.VCF4_1.getFormatString(), VCFHeaderVersion.VCF4_1.getVersionString())); + } + + @Test + public void testSilentlyRejectDuplicateContigLines() { + // Note: This is testing a case that failed with the previous implementation, when both of these + // lines were added to the master list, but only one was added to the contig line list. The two + // lines have identical key/ID values, but because they have different attributes, they have + // different hashCodes, and so can both reside in a Set. 
+ VCFContigHeaderLine contigOneNoAssembly = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "1"); + put("length", "123"); + }}, + 0); + VCFContigHeaderLine contigOneWithAssembly = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "1"); + put("length", "123"); + put("assembly", "b37"); + }}, + 0); + Assert.assertNotEquals(contigOneNoAssembly.hashCode(), contigOneWithAssembly.hashCode()); + + Set headerLineSet = new LinkedHashSet<>(); + headerLineSet.add(contigOneNoAssembly); + headerLineSet.add(contigOneWithAssembly); // kept by the Set here; the test expects VCFHeader to drop it since it has the same id + Assert.assertEquals(headerLineSet.size(), 2); + + VCFHeader vcfHeader = new VCFHeader(headerLineSet); + Set allMetaDataInput = vcfHeader.getMetaDataInInputOrder(); + Assert.assertEquals(allMetaDataInput.size(), 1); + + Set allMetaDataSorted = vcfHeader.getMetaDataInSortedOrder(); + Assert.assertEquals(allMetaDataSorted.size(), 1); + + List allContigLines = vcfHeader.getContigLines(); + Assert.assertEquals(allContigLines.size(), 1); // one contig + Assert.assertNull(allContigLines.get(0).getGenericFieldValue("assembly")); + } + + @Test(dataProvider = "conflictingHeaderVersionPairs") + public void test_MergeHeadersAcrossVersions( + final VCFHeaderVersion vcfVersion, + final VCFHeaderVersion conflictingVersion) + { + Set metaDataSet = getV42HeaderLinesWithNoFormatString(); + metaDataSet.add(new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString())); + VCFHeader header = new VCFHeader(metaDataSet); + Assert.assertEquals(header.getHeaderVersion(), vcfVersion); + + Set conflictSet = getV42HeaderLinesWithNoFormatString(); + conflictSet.add(new VCFHeaderLine(conflictingVersion.getFormatString(), conflictingVersion.getVersionString())); + VCFHeader conflictingHeader = new VCFHeader(conflictSet); + Assert.assertEquals(conflictingHeader.getHeaderVersion(), conflictingVersion); + + List headerList = new ArrayList(2); + headerList.add(header); + 
headerList.add(conflictingHeader); + + // smartMergeHeaders strips out fileformat lines and returns the remaining merged header lines + Set mergedSet = VCFUtils.smartMergeHeaders(headerList, false); + + // create a header from the merged set, which should default to the default version + VCFHeader mergedHeader = new VCFHeader(mergedSet); + Assert.assertEquals(mergedHeader.getHeaderVersion(), VCFHeader.DEFAULT_VCF_VERSION); + + // all the header lines in the merged set are also in the resulting header + Assert.assertEquals(mergedHeader.getMetaDataInInputOrder(), mergedSet); + + // since we merged two headers that are identical except for the fileformat line, assert that all + // the original header lines are in the resulting header (NOTE(review): the assertion below compares against mergedSet again, not the metaDataSet updated on the next line - confirm intent) + metaDataSet.add(new VCFHeaderLine(VCFHeader.DEFAULT_VCF_VERSION.getFormatString(), VCFHeader.DEFAULT_VCF_VERSION.getVersionString())); + Assert.assertEquals(mergedHeader.getMetaDataInInputOrder(), mergedSet); + } + + private LinkedHashSet getHeaderLineListWithoutLine( + final LinkedHashSet inputSet, + final int n) { + List headerLineList = new ArrayList<>(inputSet); + headerLineList.remove(n); + return new LinkedHashSet<>(headerLineList); + } + + @DataProvider(name = "mergeHeaderData") + public Iterator mergeHeaderData() + { + List headerLineList = new ArrayList<>(new VCFHeaderUnitTestData().getFullMetaDataLinesAsSet()); + Collection mergeTestCase = new ArrayList<>(); + for (int i = 0; i < headerLineList.size(); i++) { + mergeTestCase.add( + new Object[] { + new VCFHeader(new VCFHeaderUnitTestData().getFullMetaDataLinesAsSet()), + new VCFHeader(getHeaderLineListWithoutLine(new VCFHeaderUnitTestData().getFullMetaDataLinesAsSet(), i)) + }); + } + + return mergeTestCase.iterator(); + } + + @Test(dataProvider = "mergeHeaderData") + public void testMergeHeaders( + final VCFHeader fullHeader, + final VCFHeader subsetHeader) + { + List headerList = new ArrayList() {{ + add(fullHeader); + add(subsetHeader); + }}; + LinkedHashSet mergedList = 
new LinkedHashSet(VCFHeader.getMergedHeaderLines(headerList, false)); + + // We want to compare the set returned from the merger with the original set, but merging removes + // fileformat lines, so we need to remove the same fileformat line from the original set for comparison purposes + LinkedHashSet fullHeaderListWithoutFileFormatLine = new LinkedHashSet(fullHeader.getMetaDataInInputOrder()); + if (false == fullHeaderListWithoutFileFormatLine.remove(fullHeader.getOtherHeaderLine("fileformat"))) { + // one of the test cases has the fileformat line removed from the subsetted list; make sure this is it + Assert.assertNull(fullHeader.getOtherHeaderLine("fileformat")); + } else { + Assert.assertNotNull(fullHeader.getOtherHeaderLine("fileformat")); + } + + Assert.assertEquals(new TreeSet<>(fullHeaderListWithoutFileFormatLine), new TreeSet<>(mergedList)); + } + + @Test + public void testPreserveSequenceDictionaryAttributes() { + // Round trip a SAMSequenceDictionary with attributes, through a VCFHeader, and back + // to a SAMSequenceDictionary with the same attributes. 
+ // https://github.com/samtools/htsjdk/issues/730 + + final String assemblyString = "hg37"; + final String md5String = "68b329da9893e34099c7d8ad5cb9c940"; + final String speciesString = "Home Sapiens"; + final String urlString = "http://www.refserve.org:8080/path/"; + + SAMSequenceDictionary samDict = new SAMSequenceDictionary(); + + final SAMSequenceRecord seqRec1 = new SAMSequenceRecord("1", 1); + seqRec1.setAssembly(assemblyString); + seqRec1.setMd5(md5String); + seqRec1.setAttribute(SAMSequenceRecord.URI_TAG, urlString); + seqRec1.setSpecies(speciesString); + final SAMSequenceRecord seqRec2 = new SAMSequenceRecord("2", 1); + samDict.addSequence(seqRec1); + samDict.addSequence(seqRec2); + + VCFHeader vcfHeader = new VCFHeader(); + vcfHeader.setSequenceDictionary(samDict); + SAMSequenceDictionary roundTrippedDict = vcfHeader.getSequenceDictionary(); + + final SAMSequenceRecord rtRec1 = roundTrippedDict.getSequence("1"); + Assert.assertEquals(assemblyString, rtRec1.getAssembly()); + Assert.assertEquals(md5String, rtRec1.getMd5()); + Assert.assertEquals(urlString, rtRec1.getAttribute(SAMSequenceRecord.URI_TAG)); + Assert.assertEquals(speciesString, rtRec1.getSpecies()); + + Assert.assertEquals(seqRec1, roundTrippedDict.getSequence("1")); // somewhat redundant check on full record + Assert.assertEquals(seqRec2, roundTrippedDict.getSequence("2")); + } + @Test public void testVCFHeaderQuoteEscaping() throws Exception { // this test ensures that the end-to-end process of quote escaping is stable when headers are @@ -334,11 +672,9 @@ public void testVCFHeaderQuoteEscaping() throws Exception { final VCFFileReader originalFileReader = new VCFFileReader(new File("src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf"), false); final VCFHeader originalHeader = originalFileReader.getFileHeader(); - // add a header line with quotes to the header - final Map attributes = new LinkedHashMap<>(); - attributes.put("ID", "VariantFiltration"); - attributes.put("CommandLineOptions", 
"filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); - final VCFSimpleHeaderLine addedHeaderLine = new VCFSimpleHeaderLine("GATKCommandLine.Test", attributes); + final VCFStructuredHeaderLine addedHeaderLine = new VCFFilterHeaderLine( + "FakeFilter", + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); originalHeader.addMetaDataLine(addedHeaderLine); final VCFFilterHeaderLine originalCopyAnnotationLine1 = originalHeader.getFilterHeaderLine("ANNOTATION"); @@ -382,7 +718,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { // read the copied file back in final VCFFileReader firstCopyReader = new VCFFileReader(firstCopyVCFFile, false); final VCFHeader firstCopyHeader = firstCopyReader.getFileHeader(); - final VCFHeaderLine firstCopyNewHeaderLine = firstCopyHeader.getOtherHeaderLine("GATKCommandLine.Test"); + final VCFFilterHeaderLine firstCopyNewHeaderLine = firstCopyHeader.getFilterHeaderLine("FakeFilter"); Assert.assertNotNull(firstCopyNewHeaderLine); final VCFFilterHeaderLine firstCopyAnnotationLine1 = firstCopyHeader.getFilterHeaderLine("ANNOTATION"); @@ -425,7 +761,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { final VCFFileReader secondCopyReader = new VCFFileReader(secondCopyVCFFile, false); final VCFHeader secondCopyHeader = secondCopyReader.getFileHeader(); - final VCFHeaderLine secondCopyNewHeaderLine = secondCopyHeader.getOtherHeaderLine("GATKCommandLine.Test"); + final VCFFilterHeaderLine secondCopyNewHeaderLine = secondCopyHeader.getFilterHeaderLine("FakeFilter"); Assert.assertNotNull(secondCopyNewHeaderLine); final VCFFilterHeaderLine secondCopyAnnotationLine1 = secondCopyHeader.getFilterHeaderLine("ANNOTATION"); @@ -435,8 +771,8 @@ public void testVCFHeaderQuoteEscaping() throws Exception { Assert.assertNotNull(secondCopyAnnotationLine2); Assert.assertEquals(firstCopyNewHeaderLine, secondCopyNewHeaderLine); - 
Assert.assertEquals(firstCopyNewHeaderLine.toStringEncoding(), "GATKCommandLine.Test="); - Assert.assertEquals(secondCopyNewHeaderLine.toStringEncoding(), "GATKCommandLine.Test="); + Assert.assertEquals(firstCopyNewHeaderLine.toStringEncoding(), "FILTER="); + Assert.assertEquals(secondCopyNewHeaderLine.toStringEncoding(), "FILTER="); Assert.assertEquals(firstCopyAnnotationLine1, secondCopyAnnotationLine1); Assert.assertEquals(secondCopyAnnotationLine1.getGenericFieldValue("Description"), "ANNOTATION != \"NA\" || ANNOTATION <= 0.01"); @@ -460,70 +796,27 @@ public void testVCFHeaderQuoteEscaping() throws Exception { } - /** - * a little utility function for all tests to md5sum a file - * Shameless taken from: - *

- * http://www.javalobby.org/java/forums/t84420.html - * - * @param file the file - * @return a string - */ - private static String md5SumFile(File file) { - MessageDigest digest; - try { - digest = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - throw new RuntimeException("Unable to find MD5 digest"); - } - InputStream is; - try { - is = new FileInputStream(file); - } catch (FileNotFoundException e) { - throw new RuntimeException("Unable to open file " + file); - } - byte[] buffer = new byte[8192]; - int read; - try { - while ((read = is.read(buffer)) > 0) { - digest.update(buffer, 0, read); - } - byte[] md5sum = digest.digest(); - BigInteger bigInt = new BigInteger(1, md5sum); - return bigInt.toString(16); - - } catch (IOException e) { - throw new RuntimeException("Unable to process file for MD5", e); - } finally { - try { - is.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close input stream for MD5 calculation", e); - } - } - } - - private void checkMD5ofHeaderFile(VCFHeader header, String md5sum) { - File myTempFile = null; - PrintWriter pw = null; - try { - myTempFile = File.createTempFile("VCFHeader", "vcf"); - myTempFile.deleteOnExit(); - pw = new PrintWriter(myTempFile); - } catch (IOException e) { - Assert.fail("Unable to make a temp file!"); + // Serialize/encode the header to a file, read metaData back in + private Set getRoundTripEncoded(VCFHeader header) throws IOException { + File myTempFile = File.createTempFile("VCFHeader", "vcf"); + try (final VariantContextWriter vcfWriter = + new VariantContextWriterBuilder() + .setOutputFile(myTempFile) + .setOutputFileType(VariantContextWriterBuilder.OutputType.VCF) + .setOptions(NO_OPTIONS) + .build()) { + vcfWriter.writeHeader(header); } - for (VCFHeaderLine line : header.getMetaDataInSortedOrder()) - pw.println(line); - pw.close(); - Assert.assertEquals(md5SumFile(myTempFile), md5sum); + VCFHeader vcfHeader = (VCFHeader) new 
VCFCodec().readActualHeader(new LineIteratorImpl( + new SynchronousLineReader(new FileReader(myTempFile.getAbsolutePath())))); + return vcfHeader.getMetaDataInSortedOrder(); } - public static int VCF4headerStringCount = 16; + public static int VCF4headerStringCount = 16; // 17 -1 for the #CHROM... line - public static String VCF4headerStrings = + public static String VCF42headerStrings = "##fileformat=VCFv4.2\n" + - "##filedate=2010-06-21\n" + + "##filedate=2010-06-21\n" + "##reference=NCBI36\n" + "##INFO=\n" + "##INFO=\n" + @@ -540,7 +833,6 @@ private void checkMD5ofHeaderFile(VCFHeader header, String md5sum) { "##FORMAT=\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; - public static String VCF4headerStrings_with_negativeOne = "##fileformat=VCFv4.2\n" + "##filedate=2010-06-21\n" + diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java new file mode 100644 index 0000000000..fd242072b6 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java @@ -0,0 +1,85 @@ +package htsjdk.variant.vcf; + +import org.testng.Assert; + +import java.util.*; + +// Unit test data used by unit tests for VCFHeader, VCFMetaDataLines, and VCFHeaderLine hierarchy. 
+public class VCFHeaderUnitTestData { + public VCFHeaderVersion canonicalVersion = VCFHeader.DEFAULT_VCF_VERSION; + + // fileformat line + public List fileformatLines = new ArrayList() {{ + add(new VCFHeaderLine(canonicalVersion.getFormatString(), canonicalVersion.getVersionString())); + }}; + + // FILTER lines + public List filterLines = new ArrayList() {{ + add(new VCFFilterHeaderLine("LowQual", "Description=\"Low quality\"")); + add(new VCFFilterHeaderLine("highDP", "Description=\"DP < 8\"")); + add(new VCFFilterHeaderLine("TruthSensitivityTranche98.50to98.80", "Truth sensitivity tranche level at VSQ Lod: -0.1106 <= x < 0.6654")); + }}; + + // FORMAT lines + public List formatLines = new ArrayList() {{ + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); + add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); + add(new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); + add(new VCFFormatHeaderLine("MLPSAF", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); + }}; + + // INFO lines + public List infoLines = new ArrayList() {{ + add(new 
VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); + add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); + add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); + add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); + add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); + add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); + add(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); + add(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); + add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); + }}; + + // CONTIG lines + public List contigLines = new ArrayList() {{ + add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0)); + add(new VCFContigHeaderLine(Collections.singletonMap("ID", "2"), 1)); + add(new VCFContigHeaderLine(Collections.singletonMap("ID", "3"), 2)); + }}; + + //misc lines + public List miscLines = new ArrayList() {{ + add(new VCFHeaderLine("reference", "g37")); + add(new VCFHeaderLine("GATKCommandLine", "SelectVariants and such.")); + }}; + + //Return a full metadata lines, retaining order in a LinkedHashSet. 
+ public LinkedHashSet getFullMetaDataLinesAsSet() { + LinkedHashSet allHeaderLines = new LinkedHashSet() {{ //preserve order + addAll(fileformatLines); + addAll(filterLines); + addAll(formatLines); + addAll(infoLines); + addAll(contigLines); + addAll(miscLines); + }}; + Assert.assertEquals(allHeaderLines.size(), + fileformatLines.size() + filterLines.size() + formatLines.size() + + infoLines.size() + contigLines.size() + miscLines.size()); + return allHeaderLines; + } + + public VCFMetaDataLines getFullMetaDataLines() { + Set lineSet = getFullMetaDataLinesAsSet(); + VCFMetaDataLines md = new VCFMetaDataLines(); + md.addAllMetaDataLines(lineSet); + return md; + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java new file mode 100644 index 0000000000..ccb55b91be --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java @@ -0,0 +1,17 @@ +package htsjdk.variant.vcf; + +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * Test conditions that are unique to INFO lines (not covered by VCFCompoundHeaderLineUnitTest). 
+ */ +public class VCFInfoHeaderLineUnitTest { + + @Test + public void testRepairInfoLineFlagTypeWithNonzeroCount() { + VCFInfoHeaderLine infoLine = new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(0, infoLine.getCount()); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java new file mode 100644 index 0000000000..6e601c2fd1 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java @@ -0,0 +1,248 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class VCFMetaDataLinesUnitTest { + + @DataProvider(name="keyCollisions") + public Object[][] keyCollisions() { + return new Object[][]{ + // Unstructured key collisions + { // same key and value + new VCFHeaderLine("key", "value"), + new VCFHeaderLine("key", "value"), true + }, + { // same key, different value + new VCFHeaderLine("key", "value"), + new VCFHeaderLine("key", "value1"), false + }, + { // different key, same value + new VCFHeaderLine("key1", "value"), + new VCFHeaderLine("key2", "value"), false + }, + { // different key, different value + new VCFHeaderLine("key1", "value1"), + new VCFHeaderLine("key2", "value2"), false + }, + // Structured key collisions + { // same key, same ID + new VCFFilterHeaderLine("filterName", "unused description"), + new VCFFilterHeaderLine("filterName", "unused description"), true + }, + { // same key, different ID + new VCFFilterHeaderLine("filterName", "unused description"), + new VCFFilterHeaderLine("filterName2", "unused description"), false + }, + { // filter matches structured key/value + new VCFFilterHeaderLine("id", "unused description"), + new VCFStructuredHeaderLine("FILTER", Collections.singletonMap("ID", "id")), true + }, + { // structured key matches 
structured key/id + new VCFStructuredHeaderLine("FILTER", Collections.singletonMap("ID", "id")), + new VCFStructuredHeaderLine("FILTER", Collections.singletonMap("ID", "id")), true + }, + // Mixed structured/unstructured + { // key overlaps key/value + new VCFFilterHeaderLine("id", "unused description"), + new VCFHeaderLine("FILTER:id", "unused description"), false + }, + { // unstructured key matches structured "FILTER" key/value + new VCFStructuredHeaderLine("FILTER", Collections.singletonMap("ID", "id")), + new VCFHeaderLine("FILTER:id", "some value"), false + }, + }; + } + + @Test(dataProvider="keyCollisions") + public void testKeyCollision(final VCFHeaderLine line1, final VCFHeaderLine line2, final boolean expectCollision) + { + VCFMetaDataLines mdLines = new VCFMetaDataLines(); + mdLines.addMetaDataLine(line1); + mdLines.addMetaDataLine(line2); + Assert.assertEquals(mdLines.getMetaDataInInputOrder().size(), expectCollision ? 1 : 2); + } + + @Test + public void testRetainFullHeaderLines() { + VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = unitTestData.getFullMetaDataLines(); + + Assert.assertEquals(md.getMetaDataInInputOrder().size(), unitTestData.getFullMetaDataLinesAsSet().size()); + Assert.assertEquals(md.getMetaDataInSortedOrder().size(), unitTestData.getFullMetaDataLinesAsSet().size()); + + Assert.assertEquals(unitTestData.formatLines, md.getFormatHeaderLines()); + Assert.assertEquals(unitTestData.filterLines, md.getFilterLines()); + Assert.assertEquals(unitTestData.infoLines, md.getInfoHeaderLines()); + Assert.assertEquals(unitTestData.contigLines, md.getContigLines()); + Assert.assertEquals(unitTestData.filterLines, md.getFilterLines()); + + Set otherLines = new LinkedHashSet<>(); + otherLines.addAll(unitTestData.fileformatLines); + otherLines.addAll(unitTestData.miscLines); + Assert.assertEquals(otherLines, md.getOtherHeaderLines()); + } + + @Test + public void testAddRemoveOtherMetaDataLine() { + 
VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = unitTestData.getFullMetaDataLines(); + + int beforeAllSize = md.getMetaDataInInputOrder().size(); + int beforeStructuredSize = md.getStructuredHeaderLines().size(); + int beforeOtherSize = md.getOtherHeaderLines().size(); + + VCFHeaderLine newLine = new VCFHeaderLine("foo", "bar"); + + // add one other line + md.addMetaDataLine(newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getStructuredHeaderLines().size(), beforeStructuredSize); // remains the same + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize + 1); + + // remove the other line and we're back to original size + Assert.assertEquals(md.removeHeaderLine(newLine), newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize); + Assert.assertEquals(md.getStructuredHeaderLines().size(), beforeStructuredSize); // still remains the same + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize); + } + + @Test + public void testAddRemoveUniqueStructuredLine() { + VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = unitTestData.getFullMetaDataLines(); + + int beforeAllSize = md.getMetaDataInInputOrder().size(); + int beforeStructuredSize = md.getStructuredHeaderLines().size(); + int beforeFilterSize = md.getFilterLines().size(); + int beforeOtherSize = md.getOtherHeaderLines().size(); + + VCFFilterHeaderLine newLine = new VCFFilterHeaderLine("filterID", "unused desc"); + + // add one structured line + md.addMetaDataLine(newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getStructuredHeaderLines().size(), beforeStructuredSize + 1); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize + 1); + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize); // remains the same + + // remove the 
new line and we're back to original size + Assert.assertEquals(md.removeHeaderLine(newLine), newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize); + Assert.assertEquals(md.getStructuredHeaderLines().size(), beforeStructuredSize); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize); + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize); // still remains the same + } + + @Test + public void testAddRemoveDuplicateStructuredLine() { + VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = unitTestData.getFullMetaDataLines(); + + int beforeAllSize = md.getMetaDataInInputOrder().size(); + int beforeStructuredSize = md.getStructuredHeaderLines().size(); + int beforeFilterSize = md.getFilterLines().size(); + + VCFFilterHeaderLine newLine = new VCFFilterHeaderLine("filterID", "unused desc"); + + // add one structured (filter) line + md.addMetaDataLine(newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getStructuredHeaderLines().size(), beforeStructuredSize + 1); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize + 1); + + // add the same structured line again, second is rejected, count remains the same + md.addMetaDataLine(newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getStructuredHeaderLines().size(), beforeStructuredSize + 1); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize + 1); + Assert.assertEquals(md.getFilterHeaderLine("filterID"), newLine); + + // remove the first structured line and we're back to the original size + Assert.assertEquals(md.removeHeaderLine(newLine), newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize); + Assert.assertEquals(md.getStructuredHeaderLines().size(), beforeStructuredSize); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize); + } + 
+ @Test + public void testGetEquivalentHeaderLine() { + VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = unitTestData.getFullMetaDataLines(); + Assert.assertEquals(md.getFilterHeaderLine( + unitTestData.filterLines.get(0).getID()), + md.hasEquivalentHeaderLine(unitTestData.filterLines.get(0))); + } + + @Test + public void testGetMetaDataLine() { + VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = unitTestData.getFullMetaDataLines(); + Assert.assertEquals( + md.getFilterHeaderLine(unitTestData.filterLines.get(0).getID()), + md.getMetaDataLine(unitTestData.filterLines.get(0).getKey())); + } + + @Test + public void testGetFilterHeaderLine() { + VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = unitTestData.getFullMetaDataLines(); + Assert.assertEquals(md.getFilterHeaderLine(unitTestData.filterLines.get(0).getID()), unitTestData.filterLines.get(0)); + } + + @Test + public void testGetInfoHeaderLine() { + VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = unitTestData.getFullMetaDataLines(); + Assert.assertEquals(md.getInfoHeaderLine(unitTestData.infoLines.get(0).getID()), unitTestData.infoLines.get(0)); + } + + @Test + public void testGetFormatHeaderLine() { + VCFHeaderUnitTestData testData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = testData.getFullMetaDataLines(); + Assert.assertEquals(md.getFormatHeaderLine(testData.formatLines.get(0).getID()), testData.formatLines.get(0)); + } + + @DataProvider(name="conflictingVCFVersions") + public Object[][] vcfVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @Test(dataProvider="conflictingVCFVersions", expectedExceptions = TribbleException.class) + public void testValidateMetaDataLinesConflictingVersion(final VCFHeaderVersion vcfVersion) { + VCFHeaderUnitTestData 
unitTestData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = unitTestData.getFullMetaDataLines(); // contains a VCFv42 fileformat line + md.validateMetaDataLines(vcfVersion); + } + + @Test(dataProvider="conflictingVCFVersions", expectedExceptions = TribbleException.class) + public void testValidateMetaDataLineConflictingVersion(final VCFHeaderVersion vcfVersion) { + VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = unitTestData.getFullMetaDataLines(); // contains a VCFv42 fileformat line + md.getMetaDataInInputOrder().forEach(hl -> md.validateMetaDataLine(vcfVersion, hl)); + } + + @Test + public void testValidateMetaDataLinesValidVersion() { + VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = unitTestData.getFullMetaDataLines(); // contains a VCFv42 fileformat line + md.validateMetaDataLines(unitTestData.canonicalVersion); + } + + @Test + public void testValidateMetaDataLineVlidVersion() { + VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + VCFMetaDataLines md = unitTestData.getFullMetaDataLines(); // contains a VCFv42 fileformat line + md.getMetaDataInInputOrder().forEach(hl -> md.validateMetaDataLine(unitTestData.canonicalVersion, hl)); + } +} + diff --git a/src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java new file mode 100644 index 0000000000..a8bbc3151a --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java @@ -0,0 +1,43 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFMetaHeaderLineUnitTest { + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public 
Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + }; + } + + private static String metaString = ""; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + VCFMetaHeaderLine vcfLine = new VCFMetaHeaderLine(metaString, vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFStructuredHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFMetaHeaderLine(metaString, vcfAllowedVersion); + } + + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java new file mode 100644 index 0000000000..9f1e84b672 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java @@ -0,0 +1,44 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFPedigreeHeaderLineUnitTest { + + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + }; + } + + private static String pedigreeString = "PEDIGREE="; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + 
VCFPedigreeHeaderLine vcfLine = new VCFPedigreeHeaderLine(pedigreeString, vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFStructuredHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFPedigreeHeaderLine(pedigreeString, vcfAllowedVersion); + } + + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java new file mode 100644 index 0000000000..ce843731a4 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java @@ -0,0 +1,42 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFSampleHeaderLineUnitTest { + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + }; + } + + private static String sampleString = "SAMPLE="; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + VCFSampleHeaderLine vcfLine = new VCFSampleHeaderLine(sampleString, vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFStructuredHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final 
VCFHeaderVersion vcfAllowedVersion) { + new VCFSampleHeaderLine(sampleString, vcfAllowedVersion); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFStructuredHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFStructuredHeaderLineUnitTest.java new file mode 100644 index 0000000000..f2de723157 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFStructuredHeaderLineUnitTest.java @@ -0,0 +1,154 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.LinkedHashMap; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; + +public class VCFStructuredHeaderLineUnitTest { + + private VCFStructuredHeaderLine getStructuredHeaderLine() { + return new VCFStructuredHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put("attr1", "value1"); + put("attr2", "value2"); + }} + ); + } + + @Test + public void testConstructorFromStrings() { + VCFStructuredHeaderLine hl = new VCFStructuredHeaderLine("testKey", "testId", "test description"); + Assert.assertEquals("testKey", hl.getKey()); + Assert.assertEquals("testId", hl.getID()); + Assert.assertEquals("test description", hl.getGenericFieldValue(VCFStructuredHeaderLine.DESCRIPTION_ATTRIBUTE)); + Assert.assertEquals("testKey=", hl.toStringEncoding()); + } + + @Test + public void testConstructorFromEncodedLine() { + VCFStructuredHeaderLine hLine = new VCFStructuredHeaderLine("key", "", VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(hLine.getKey(), "key"); + Assert.assertEquals(hLine.getID(), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("ID"), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("attr1"), "value1"); + } + + @Test + public void testConstructorFromAttributeMap() { + VCFStructuredHeaderLine hLine = new VCFStructuredHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + 
put("attr1", "value1"); + put("attr2", "value2"); + }}); + + Assert.assertEquals(hLine.getKey(), "key"); + Assert.assertEquals(hLine.getID(), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("ID"), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("attr1"), "value1"); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectIdMissingFromEncodedLine() { + new VCFStructuredHeaderLine("key", "", VCFHeader.DEFAULT_VCF_VERSION); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectIdMissingFromAttributeMap() { + new VCFStructuredHeaderLine( + "key", + new LinkedHashMap() {{ + put("attr1", "value1"); + put("attr2", "value2"); + }}); + } + + @DataProvider(name = "violateIDRequirements") + public Object[][] getViolateIDRequirements() { + return new Object[][]{ + {""}, + {""}, + {""}, + {""} + }; + } + + @Test(dataProvider="violateIDRequirements",expectedExceptions=TribbleException.class) + public void testViolateIDRequirements(final String headerLine) { + new VCFStructuredHeaderLine("key", headerLine, VCFHeader.DEFAULT_VCF_VERSION); + } + + @Test + public void testGetID() { + Assert.assertEquals(getStructuredHeaderLine().getID(), "id"); + } + + @Test + public void testIsStructuredHeaderLine() { + Assert.assertTrue(getStructuredHeaderLine().isStructuredHeaderLine()); + } + + @Test + public void testGetGenericFieldValue() { + Assert.assertEquals(getStructuredHeaderLine().getGenericFieldValue("attr1"), "value1"); + } + + @Test + public void testStringEncoding() { + final VCFStructuredHeaderLine structuredHL = getStructuredHeaderLine(); + Assert.assertEquals(structuredHL.toStringEncoding(),"key="); + } + + @Test + public void testUnescapedQuotedStringEncoding() { + VCFStructuredHeaderLine unescapedHeaderLine = new VCFStructuredHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put(VCFStructuredHeaderLine.DESCRIPTION_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || 
ANNOTATION <= 2.0]"); + put(VCFStructuredHeaderLine.SOURCE_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); + }} + ); + + final String encodedAttributes = unescapedHeaderLine.toStringEncoding(); + assertNotNull(encodedAttributes); + + final String expectedEncoding = "key="; + assertEquals(encodedAttributes, expectedEncoding); + } + + @Test + public void testEscapedQuotedStringEncoding() { + // test Source and Version attributes + VCFStructuredHeaderLine unescapedHeaderLine = new VCFStructuredHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put(VCFStructuredHeaderLine.DESCRIPTION_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + put(VCFStructuredHeaderLine.SOURCE_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + }} + ); + + final String encodedAttributes = unescapedHeaderLine.toStringEncoding(); + assertNotNull(encodedAttributes); + + final String expectedEncoding = "key="; + assertEquals(encodedAttributes, expectedEncoding); + } + +} diff --git a/src/test/resources/htsjdk/variant/HiSeq.10000.vcf b/src/test/resources/htsjdk/variant/HiSeq.10000.vcf index a304ba24da..75c9f9b537 100644 --- a/src/test/resources/htsjdk/variant/HiSeq.10000.vcf +++ b/src/test/resources/htsjdk/variant/HiSeq.10000.vcf @@ -9,7 +9,6 @@ ##FILTER= ##FILTER= ##FILTER= -##FILTER= ##FILTER= ##FORMAT= ##FORMAT= diff --git a/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf b/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf index 9af0cb3e64..097d0b034f 100644 --- a/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf +++ b/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf @@ -9,7 +9,6 @@ ##FILTER= ##FILTER= ##FILTER= -##FILTER= ##FILTER= ##FILTER= ##FILTER=