Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support VCF v4.3 (read only). #1359

Merged
merged 5 commits into from
May 31, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,8 @@ public static VCFHeader writeHeader(VCFHeader header,
final String streamNameForError) {

try {
rejectVCFV43Headers(header);

// the file format field needs to be written first
writer.write(versionLine + "\n");

Expand Down Expand Up @@ -258,10 +260,21 @@ public void add(final VariantContext context) {

@Override
public void setHeader(final VCFHeader header) {
rejectVCFV43Headers(header);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why check here as well as in write? Is it illegal to create a VCFWriter in this state, even if it's never used to write a VCF?


if (outputHasBeenWritten) {
throw new IllegalStateException("The header cannot be modified after the header or variants have been written to the output stream.");
}
this.mHeader = doNotWriteGenotypes ? new VCFHeader(header.getMetaDataInSortedOrder()) : header;
this.vcfEncoder = new VCFEncoder(this.mHeader, this.allowMissingFieldsInHeader, this.writeFullFormatField);
}

// writing vcf v4.3 is not implemented
private static void rejectVCFV43Headers(final VCFHeader targetHeader) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the interest of adding future checks, how about calling this ensureValidToWrite() or something like that? Personally, I use 'ensure' for (non-testing) methods that assert a state and throw an error if the state isn't valid. 'check' is for methods that assert but return a result code instead of throwing. Then the specific comment about checking for 4.3 can be moved below, to the if.

if (targetHeader.getVCFHeaderVersion() != null && targetHeader.getVCFHeaderVersion().isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) {
throw new IllegalArgumentException(String.format("Writing VCF version %s is not implemented", targetHeader.getVCFHeaderVersion()));
}


}
}
126 changes: 111 additions & 15 deletions src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,10 @@
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.index.tabix.TabixFormat;
import htsjdk.tribble.util.ParsingUtils;
import htsjdk.utils.ValidationUtils;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;

import java.io.FileNotFoundException;
import java.io.IOException;
Expand All @@ -44,7 +46,6 @@
import java.util.*;
import java.util.zip.GZIPInputStream;


public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext> implements NameAwareCodec {
public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20);

Expand All @@ -53,6 +54,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
// we have to store the list of strings that make up the header until they're needed
protected VCFHeader header = null;
protected VCFHeaderVersion version = null;
protected VCFTextTransformer vcfTextTransformer = getTextTransformerForVCFVersion(VCFHeaderVersion.VCF4_2);

// a mapping of the allele
protected Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
Expand Down Expand Up @@ -132,6 +134,7 @@ public LazyGenotypesContext.LazyData parse(final Object data) {
*/
protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, final VCFHeaderVersion version ) {
this.version = version;
vcfTextTransformer = getTextTransformerForVCFVersion(version);

Set<VCFHeaderLine> metaData = new LinkedHashSet<VCFHeaderLine>();
Set<String> sampleNames = new LinkedHashSet<String>();
Expand Down Expand Up @@ -196,8 +199,13 @@ protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, fina
final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++);
metaData.add(contig);
} else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) {
final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description"), Collections.emptyList());
metaData.add(alt);
metaData.add(getAltHeaderLine(str.substring(VCFConstants.ALT_HEADER_OFFSET), version));
} else if ( str.startsWith(VCFConstants.PEDIGREE_HEADER_START) ) {
metaData.add(getPedigreeHeaderLine(str.substring(VCFConstants.PEDIGREE_HEADER_OFFSET), version));
} else if ( str.startsWith(VCFConstants.META_HEADER_START) ) {
metaData.add(getMetaHeaderLine(str.substring(VCFConstants.META_HEADER_OFFSET), version));
} else if ( str.startsWith(VCFConstants.SAMPLE_HEADER_START) ) {
metaData.add(getSampleHeaderLine(str.substring(VCFConstants.SAMPLE_HEADER_OFFSET), version));
} else {
int equals = str.indexOf('=');
if ( equals != -1 )
Expand All @@ -206,9 +214,7 @@ protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, fina
}
}

this.header = new VCFHeader(metaData, sampleNames);
if ( doOnTheFlyModifications )
this.header = VCFStandardHeaderLines.repairStandardHeaderLines(this.header);
setVCFHeader(new VCFHeader(version, metaData, sampleNames), version);
return this.header;
}

Expand All @@ -230,21 +236,76 @@ public VCFHeaderVersion getVersion() {
/**
* Explicitly set the VCFHeader on this codec. This will overwrite the header read from the file
* and the version state stored in this instance; conversely, reading the header from a file will
* overwrite whatever is set here. The returned header may not be identical to the header argument
* since the header lines may be "repaired" (i.e., rewritten) if doOnTheFlyModifications is set.
* overwrite whatever is set here.
*
* @param newHeader
* @param newVersion
* @return the actual header for this codec. The returned header may not be identical to the header
* argument since the header lines may be "repaired" (i.e., rewritten) if doOnTheFlyModifications is set.
* @throws TribbleException if the requested header version is not compatible with the existing version
*/
public VCFHeader setVCFHeader(final VCFHeader header, final VCFHeaderVersion version) {
this.version = version;

public VCFHeader setVCFHeader(final VCFHeader newHeader, final VCFHeaderVersion newVersion) {
validateHeaderVersionTransition(newHeader, newVersion);
if (this.doOnTheFlyModifications) {
this.header = VCFStandardHeaderLines.repairStandardHeaderLines(header);
final VCFHeader repairedHeader = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader);
// validate the new header after repair to ensure the resulting header version is
// still compatible with the current version
validateHeaderVersionTransition(repairedHeader, newVersion);
this.header = repairedHeader;
} else {
this.header = header;
this.header = newHeader;
}

this.version = newVersion;
this.vcfTextTransformer = getTextTransformerForVCFVersion(newVersion);

return this.header;
}

/**
 * Build a {@link VCFAltHeaderLine} from the text of an ALT header line.
 *
 * @param headerLineString the VCF header line being parsed, without the leading "##ALT="
 * @param sourceVersion the VCF header version of the source the line was read from; the resulting
 *                      header line object should be validated for this header version
 * @return a new VCFAltHeaderLine object
 */
public VCFAltHeaderLine getAltHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
    final VCFAltHeaderLine altHeaderLine = new VCFAltHeaderLine(headerLineString, sourceVersion);
    return altHeaderLine;
}

/**
 * Build a {@link VCFPedigreeHeaderLine} from the text of a PEDIGREE header line.
 *
 * @param headerLineString the VCF header line being parsed, without the leading "##PEDIGREE="
 * @param sourceVersion the VCF header version of the source the line was read from; the resulting
 *                      header line object should be validated for this header version
 * @return a new VCFPedigreeHeaderLine object
 */
public VCFPedigreeHeaderLine getPedigreeHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
    final VCFPedigreeHeaderLine pedigreeHeaderLine = new VCFPedigreeHeaderLine(headerLineString, sourceVersion);
    return pedigreeHeaderLine;
}

/**
 * Build a {@link VCFMetaHeaderLine} from the text of a META header line.
 *
 * @param headerLineString the VCF header line being parsed, without the leading "##META="
 * @param sourceVersion the VCF header version of the source the line was read from; the resulting
 *                      header line object should be validated for this header version
 * @return a new VCFMetaHeaderLine object
 */
public VCFMetaHeaderLine getMetaHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
    final VCFMetaHeaderLine metaHeaderLine = new VCFMetaHeaderLine(headerLineString, sourceVersion);
    return metaHeaderLine;
}

/**
 * Build a {@link VCFSampleHeaderLine} from the text of a SAMPLE header line.
 *
 * @param headerLineString the VCF header line being parsed, without the leading "##SAMPLE="
 * @param sourceVersion the VCF header version of the source the line was read from; the resulting
 *                      header line object should be validated for this header version
 * @return a new VCFSampleHeaderLine object
 */
public VCFSampleHeaderLine getSampleHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
    final VCFSampleHeaderLine sampleHeaderLine = new VCFSampleHeaderLine(headerLineString, sourceVersion);
    return sampleHeaderLine;
}

/**
* the fast decode function
* @param line the line of text for the record
Expand All @@ -264,6 +325,40 @@ public VariantContext decode(String line) {
return decodeLine(line, true);
}

/**
 * Throw if a new version/header is not compatible with the existing version/header. Generally, any
 * version before v4.2 can be up-converted to v4.2, but not to v4.3. Once a header is established as
 * v4.3, it cannot be up- or down-converted, and it must remain at v4.3.
 *
 * @param newHeader the header being proposed for this codec; must not be null
 * @param newVersion the version being proposed for this codec; must not be null
 * @throws TribbleException if the header conversion is not valid
 */
private void validateHeaderVersionTransition(final VCFHeader newHeader, final VCFHeaderVersion newVersion) {
    ValidationUtils.nonNull(newHeader);
    ValidationUtils.nonNull(newVersion);

    // The codec's current version must be allowed to move to the requested version.
    VCFHeader.validateVersionTransition(this.version, newVersion);

    // The header-to-header check is skipped when this codec has no header yet (this happens when the
    // header is being established for the first time during file parsing), or when the new header
    // reports no version.
    if (this.header == null || newHeader.getVCFHeaderVersion() == null) {
        return;
    }
    VCFHeader.validateVersionTransition(this.header.getVCFHeaderVersion(), newHeader.getVCFHeaderVersion());
}

/**
* For v4.3 up, attribute values can contain embedded percent-encoded characters which must be decoded
* on read. Return a version-aware text transformer that can decode encoded text.
* @param targetVersion the version for which a transformer is being requested
* @return a {@link VCFTextTransformer} suitable for the targetVersion
*/
private static VCFTextTransformer getTextTransformerForVCFVersion(final VCFHeaderVersion targetVersion) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is not overridable, what about moving it to VCFTextTransformer and make it a factory method, e.g. VCFTextTransformer.fromVersion().

Another possible place for this is VCFHeaderVersion, e.g. VCFHeaderVersion.createTextTransformer().

return targetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3) ?
new VCFTextTransformer() :
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these could be constants since they have no state

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

new VCFPassThruTextTransformer();
}

private VariantContext decodeLine(final String line, final boolean includeGenotypes) {
// the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
Expand Down Expand Up @@ -429,14 +524,14 @@ private Map<String, Object> parseInfo(String infoField) {
// split on the INFO field separator
List<String> infoValueSplit = ParsingUtils.split(valueString, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR);
if ( infoValueSplit.size() == 1 ) {
value = infoValueSplit.get(0);
value = vcfTextTransformer.transformEncodedText(infoValueSplit.get(0));
final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key);
if ( headerLine != null && headerLine.getType() == VCFHeaderLineType.Flag && value.equals("0") ) {
// deal with the case where a flag field has =0, such as DB=0, by skipping the add
continue;
}
} else {
value = infoValueSplit;
value = vcfTextTransformer.transformEncodedText(infoValueSplit);
}
} else {
key = infoFields.get(i);
Expand Down Expand Up @@ -675,6 +770,7 @@ public LazyGenotypesContext.LazyData createGenotypeMap(final String str,
boolean PlIsSet = false;
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
List<String> genotypeValues = ParsingUtils.split(genotypeParts[genotypeOffset], VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
genotypeValues = vcfTextTransformer.transformEncodedText(genotypeValues);

final String sampleName = sampleNameIterator.next();
final GenotypeBuilder gb = new GenotypeBuilder(sampleName);
Expand Down
22 changes: 22 additions & 0 deletions src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package htsjdk.variant.vcf;

import java.util.*;

/**
 * A class representing ALT fields in the VCF header.
 */
public class VCFAltHeaderLine extends VCFSimpleHeaderLine {
    private static final long serialVersionUID = 1L;

    // Tags handed to the parser as the expected tags of an ALT line. Built with Arrays.asList rather
    // than double-brace initialization, which would create an anonymous ArrayList subclass; the
    // field is final and the list is wrapped as unmodifiable.
    private static final List<String> EXPECTED_TAGS =
            Collections.unmodifiableList(Arrays.asList(ID_ATTRIBUTE, DESCRIPTION_ATTRIBUTE));

    /**
     * Create an ALT header line by parsing {@code line} with a {@link VCF4Parser}.
     *
     * @param line the ALT header line text to parse; presumably without the leading "##ALT="
     *             (the visible caller strips it) - confirm for other callers
     * @param version the VCF header version of the source; accepted for signature compatibility
     *                but not used by this constructor
     */
    public VCFAltHeaderLine(final String line, final VCFHeaderVersion version) {
        super(VCFConstants.ALT_HEADER_KEY, new VCF4Parser().parseLine(line, EXPECTED_TAGS));
    }

}
2 changes: 1 addition & 1 deletion src/main/java/htsjdk/variant/vcf/VCFCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public Object readActualHeader(final LineIterator lineIterator) {
version = VCFHeaderVersion.toHeaderVersion(lineFields[1]);
if ( ! version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) )
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]);
if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 )
if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 && version != VCFHeaderVersion.VCF4_3)
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]);
}
headerStrings.add(lineIterator.next());
Expand Down
17 changes: 16 additions & 1 deletion src/main/java/htsjdk/variant/vcf/VCFConstants.java
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,25 @@ public final class VCFConstants {
public static final String FILTER_HEADER_START = "##FILTER";
public static final String FORMAT_HEADER_START = "##FORMAT";
public static final String INFO_HEADER_START = "##INFO";
public static final String ALT_HEADER_START = "##ALT";
public static final String ALT_HEADER_KEY = "ALT";
public static final String ALT_HEADER_START = VCFHeader.METADATA_INDICATOR + ALT_HEADER_KEY ;
public static final String CONTIG_HEADER_KEY = "contig";
public static final String CONTIG_HEADER_START = "##" + CONTIG_HEADER_KEY;

public static final int ALT_HEADER_OFFSET = ALT_HEADER_START.length() + 1;

public static final String PEDIGREE_HEADER_KEY = "PEDIGREE";
public static final String PEDIGREE_HEADER_START = VCFHeader.METADATA_INDICATOR + PEDIGREE_HEADER_KEY;
public static final int PEDIGREE_HEADER_OFFSET = PEDIGREE_HEADER_START.length() + 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for not continuing the use of magic numbers here!


public static final String SAMPLE_HEADER_KEY = "SAMPLE";
public static final String SAMPLE_HEADER_START = VCFHeader.METADATA_INDICATOR + SAMPLE_HEADER_KEY;
public static final int SAMPLE_HEADER_OFFSET = SAMPLE_HEADER_START.length() + 1;

public static final String META_HEADER_KEY = "META";
public static final String META_HEADER_START = VCFHeader.METADATA_INDICATOR + META_HEADER_KEY;
public static final int META_HEADER_OFFSET = META_HEADER_START.length() + 1;

// old indel alleles
public static final char DELETION_ALLELE_v3 = 'D';
public static final char INSERTION_ALLELE_v3 = 'I';
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/htsjdk/variant/vcf/VCFEncoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
public class VCFEncoder {

/**
* The encoding used for VCF files: ISO-8859-1
* The encoding used for VCF files: ISO-8859-1. When writing VCF4.3 is implemented, this should change to UTF-8.
*/
public static final Charset VCF_CHARSET = Charset.forName("ISO-8859-1");
private static final String QUAL_FORMAT_STRING = "%.2f";
Expand Down
Loading