Skip to content

Commit

Permalink
Support VCF v4.3 (read only). (#1359)
Browse files Browse the repository at this point in the history
* Support VCF v4.3 (read only).
* Add UTF-8 source declarations for compile and javadoc tasks.
  • Loading branch information
cmnbroad authored and lbergelson committed May 31, 2019
1 parent 1e3f1fa commit d5ac863
Show file tree
Hide file tree
Showing 30 changed files with 1,200 additions and 53 deletions.
6 changes: 6 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,13 @@ group = 'com.github.samtools'

defaultTasks 'jar'

// Compile every Java source set with an explicit UTF-8 source encoding,
// instead of relying on the platform default charset.
tasks.withType(JavaCompile) {
options.encoding = 'UTF-8'
}

// Give every Javadoc task the same 'encoding' option with the value UTF-8,
// so documentation generation reads the sources the same way the compiler does.
tasks.withType(Javadoc) {
options.addStringOption('encoding', 'UTF-8')
}

jar {
manifest {
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,8 @@ public static VCFHeader writeHeader(VCFHeader header,
final String streamNameForError) {

try {
rejectVCFV43Headers(header);

// the file format field needs to be written first
writer.write(versionLine + "\n");

Expand Down Expand Up @@ -258,10 +260,21 @@ public void add(final VariantContext context) {

/**
 * Sets the header that this writer will use.
 *
 * Headers at VCF v4.3 or later are rejected first, since writing them is not implemented.
 * When {@code doNotWriteGenotypes} is set, a new header built only from the sorted
 * metadata lines of {@code header} is stored instead of {@code header} itself. The
 * encoder is rebuilt against whichever header ends up being stored.
 *
 * @param header the header to use for subsequent output
 * @throws IllegalArgumentException if {@code header} is VCF v4.3 or later
 * @throws IllegalStateException if the header or any variants have already been written
 */
@Override
public void setHeader(final VCFHeader header) {
    rejectVCFV43Headers(header);

    if (outputHasBeenWritten) {
        throw new IllegalStateException("The header cannot be modified after the header or variants have been written to the output stream.");
    }

    final VCFHeader effectiveHeader;
    if (doNotWriteGenotypes) {
        effectiveHeader = new VCFHeader(header.getMetaDataInSortedOrder());
    } else {
        effectiveHeader = header;
    }

    this.mHeader = effectiveHeader;
    this.vcfEncoder = new VCFEncoder(effectiveHeader, this.allowMissingFieldsInHeader, this.writeFullFormatField);
}

/**
 * Guard used before a header is accepted for output: writing VCF v4.3 is not implemented.
 * A header that reports no version is allowed through.
 *
 * @param targetHeader the header about to be used for writing
 * @throws IllegalArgumentException if the header's version is v4.3 or more recent
 */
private static void rejectVCFV43Headers(final VCFHeader targetHeader) {
    final VCFHeaderVersion headerVersion = targetHeader.getVCFHeaderVersion();
    if (headerVersion == null) {
        return;
    }
    if (headerVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) {
        throw new IllegalArgumentException(String.format("Writing VCF version %s is not implemented", headerVersion));
    }
}
}
128 changes: 113 additions & 15 deletions src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.index.tabix.TabixFormat;
import htsjdk.tribble.util.ParsingUtils;
import htsjdk.utils.ValidationUtils;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.variantcontext.*;

Expand All @@ -44,7 +45,6 @@
import java.util.*;
import java.util.zip.GZIPInputStream;


public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext> implements NameAwareCodec {
public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20);

Expand All @@ -54,6 +54,11 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
protected VCFHeader header = null;
protected VCFHeaderVersion version = null;

// Shared transformer selected for VCF v4.3 and later, where attribute values may
// contain percent-encoded characters that must be decoded on read.
private final static VCFTextTransformer percentEncodingTextTransformer = new VCFPercentEncodedTextTransformer();
// Shared transformer selected for versions before v4.3, or when no version is known.
private final static VCFTextTransformer passThruTextTransformer = new VCFPassThruTextTransformer();
// By default, we use the passThruTextTransformer (assume pre v4.3); this is
// re-selected from the header version whenever setVCFHeader is called.
private VCFTextTransformer vcfTextTransformer = passThruTextTransformer;

// a mapping of the allele
protected Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);

Expand Down Expand Up @@ -196,8 +201,13 @@ protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, fina
final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++);
metaData.add(contig);
} else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) {
final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description"), Collections.emptyList());
metaData.add(alt);
metaData.add(getAltHeaderLine(str.substring(VCFConstants.ALT_HEADER_OFFSET), version));
} else if ( str.startsWith(VCFConstants.PEDIGREE_HEADER_START) ) {
metaData.add(getPedigreeHeaderLine(str.substring(VCFConstants.PEDIGREE_HEADER_OFFSET), version));
} else if ( str.startsWith(VCFConstants.META_HEADER_START) ) {
metaData.add(getMetaHeaderLine(str.substring(VCFConstants.META_HEADER_OFFSET), version));
} else if ( str.startsWith(VCFConstants.SAMPLE_HEADER_START) ) {
metaData.add(getSampleHeaderLine(str.substring(VCFConstants.SAMPLE_HEADER_OFFSET), version));
} else {
int equals = str.indexOf('=');
if ( equals != -1 )
Expand All @@ -206,9 +216,7 @@ protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, fina
}
}

this.header = new VCFHeader(metaData, sampleNames);
if ( doOnTheFlyModifications )
this.header = VCFStandardHeaderLines.repairStandardHeaderLines(this.header);
setVCFHeader(new VCFHeader(version, metaData, sampleNames), version);
return this.header;
}

Expand All @@ -230,21 +238,76 @@ public VCFHeaderVersion getVersion() {
/**
* Explicitly set the VCFHeader on this codec. This will overwrite the header read from the file
* and the version state stored in this instance; conversely, reading the header from a file will
* overwrite whatever is set here. The returned header may not be identical to the header argument
* since the header lines may be "repaired" (i.e., rewritten) if doOnTheFlyModifications is set.
* overwrite whatever is set here.
*
* @param newHeader
* @param newVersion
* @return the actual header for this codec. The returned header may not be identical to the header
* argument since the header lines may be "repaired" (i.e., rewritten) if doOnTheFlyModifications is set.
* @throws TribbleException if the requested header version is not compatible with the existing version
*/
public VCFHeader setVCFHeader(final VCFHeader header, final VCFHeaderVersion version) {
this.version = version;

public VCFHeader setVCFHeader(final VCFHeader newHeader, final VCFHeaderVersion newVersion) {
validateHeaderVersionTransition(newHeader, newVersion);
if (this.doOnTheFlyModifications) {
this.header = VCFStandardHeaderLines.repairStandardHeaderLines(header);
final VCFHeader repairedHeader = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader);
// validate the new header after repair to ensure the resulting header version is
// still compatible with the current version
validateHeaderVersionTransition(repairedHeader, newVersion);
this.header = repairedHeader;
} else {
this.header = header;
this.header = newHeader;
}

this.version = newVersion;
this.vcfTextTransformer = getTextTransformerForVCFVersion(newVersion);

return this.header;
}

/**
 * Builds a {@link VCFAltHeaderLine} from the text of an ALT header line.
 *
 * @param headerLineString the header line being parsed, with the leading "##ALT=" already removed
 * @param sourceVersion the VCF header version of the source from which the line was read; the
 *                      resulting header line object should be validated against this version
 * @return a VCFAltHeaderLine object
 */
public VCFAltHeaderLine getAltHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
    final VCFAltHeaderLine altLine = new VCFAltHeaderLine(headerLineString, sourceVersion);
    return altLine;
}

/**
 * Builds a {@link VCFPedigreeHeaderLine} from the text of a PEDIGREE header line.
 *
 * @param headerLineString the header line being parsed, with the leading "##PEDIGREE=" already removed
 * @param sourceVersion the VCF header version of the source from which the line was read; the
 *                      resulting header line object should be validated against this version
 * @return a VCFPedigreeHeaderLine object
 */
public VCFPedigreeHeaderLine getPedigreeHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
    final VCFPedigreeHeaderLine pedigreeLine = new VCFPedigreeHeaderLine(headerLineString, sourceVersion);
    return pedigreeLine;
}

/**
 * Builds a {@link VCFMetaHeaderLine} from the text of a META header line.
 *
 * @param headerLineString the header line being parsed, with the leading "##META=" already removed
 * @param sourceVersion the VCF header version of the source from which the line was read; the
 *                      resulting header line object should be validated against this version
 * @return a VCFMetaHeaderLine object
 */
public VCFMetaHeaderLine getMetaHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
    final VCFMetaHeaderLine metaLine = new VCFMetaHeaderLine(headerLineString, sourceVersion);
    return metaLine;
}

/**
 * Builds a {@link VCFSampleHeaderLine} from the text of a SAMPLE header line.
 *
 * @param headerLineString the header line being parsed, with the leading "##SAMPLE=" already removed
 * @param sourceVersion the VCF header version of the source from which the line was read; the
 *                      resulting header line object should be validated against this version
 * @return a VCFSampleHeaderLine object
 */
public VCFSampleHeaderLine getSampleHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) {
    final VCFSampleHeaderLine sampleLine = new VCFSampleHeaderLine(headerLineString, sourceVersion);
    return sampleLine;
}

/**
* the fast decode function
* @param line the line of text for the record
Expand All @@ -264,6 +327,40 @@ public VariantContext decode(String line) {
return decodeLine(line, true);
}

/**
 * Throws if a new version/header is not compatible with the existing version/header. Generally,
 * any version before v4.2 can be up-converted to v4.2, but not to v4.3. Once a header is
 * established as v4.3, it cannot be up- or down-converted, and it must remain at v4.3.
 *
 * @param newHeader the header that is about to replace the current one; must not be null
 * @param newVersion the version that is about to replace the current one; must not be null
 * @throws TribbleException if the header conversion is not valid
 */
private void validateHeaderVersionTransition(final VCFHeader newHeader, final VCFHeaderVersion newVersion) {
    ValidationUtils.nonNull(newHeader);
    ValidationUtils.nonNull(newVersion);

    // First check the codec's current version against the requested version.
    VCFHeader.validateVersionTransition(version, newVersion);

    // If this codec currently has no header (this happens when the header is being established
    // for the first time during file parsing), there is nothing to compare against, so the
    // header-to-header check is bypassed.
    if (header == null) {
        return;
    }

    // Likewise, a new header that carries no version of its own cannot be compared.
    final VCFHeaderVersion incomingHeaderVersion = newHeader.getVCFHeaderVersion();
    if (incomingHeaderVersion == null) {
        return;
    }

    VCFHeader.validateVersionTransition(header.getVCFHeaderVersion(), incomingHeaderVersion);
}

/**
 * For v4.3 and up, attribute values can contain embedded percent-encoded characters which must
 * be decoded on read. Returns a version-aware text transformer that can decode such text.
 *
 * @param targetVersion the version for which a transformer is being requested; may be null,
 *                      in which case the pass-through transformer is returned
 * @return a {@link VCFTextTransformer} suitable for the targetVersion
 */
private VCFTextTransformer getTextTransformerForVCFVersion(final VCFHeaderVersion targetVersion) {
    // Unknown or pre-v4.3 versions get the pass-through transformer.
    if (targetVersion == null || !targetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) {
        return passThruTextTransformer;
    }
    return percentEncodingTextTransformer;
}

private VariantContext decodeLine(final String line, final boolean includeGenotypes) {
// the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
Expand Down Expand Up @@ -429,14 +526,14 @@ private Map<String, Object> parseInfo(String infoField) {
// split on the INFO field separator
List<String> infoValueSplit = ParsingUtils.split(valueString, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR);
if ( infoValueSplit.size() == 1 ) {
value = infoValueSplit.get(0);
value = vcfTextTransformer.decodeText(infoValueSplit.get(0));
final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key);
if ( headerLine != null && headerLine.getType() == VCFHeaderLineType.Flag && value.equals("0") ) {
// deal with the case where a flag field has =0, such as DB=0, by skipping the add
continue;
}
} else {
value = infoValueSplit;
value = vcfTextTransformer.decodeText(infoValueSplit);
}
} else {
key = infoFields.get(i);
Expand Down Expand Up @@ -675,6 +772,7 @@ public LazyGenotypesContext.LazyData createGenotypeMap(final String str,
boolean PlIsSet = false;
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
List<String> genotypeValues = ParsingUtils.split(genotypeParts[genotypeOffset], VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
genotypeValues = vcfTextTransformer.decodeText(genotypeValues);

final String sampleName = sampleNameIterator.next();
final GenotypeBuilder gb = new GenotypeBuilder(sampleName);
Expand Down
22 changes: 22 additions & 0 deletions src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package htsjdk.variant.vcf;

import java.util.*;

/**
 * A class representing ALT fields in the VCF header.
 */
public class VCFAltHeaderLine extends VCFSimpleHeaderLine {
    private static final long serialVersionUID = 1L;

    /**
     * The tags handed to the parser as the expected tags of an ALT line, in order.
     * Declared final and built without double-brace initialization, so the reference
     * cannot be reassigned and no anonymous ArrayList subclass is created.
     */
    private static final List<String> EXPECTED_TAGS = Collections.unmodifiableList(
            Arrays.asList(ID_ATTRIBUTE, DESCRIPTION_ATTRIBUTE));

    /**
     * Creates an ALT header line by parsing the given header line text.
     *
     * @param line the text of the ALT header line to parse
     * @param version the VCF header version of the source; accepted for signature
     *                compatibility but not currently consulted by this constructor
     */
    public VCFAltHeaderLine(final String line, final VCFHeaderVersion version) {
        super(VCFConstants.ALT_HEADER_KEY, new VCF4Parser().parseLine(line, EXPECTED_TAGS));
    }

}
2 changes: 1 addition & 1 deletion src/main/java/htsjdk/variant/vcf/VCFCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public Object readActualHeader(final LineIterator lineIterator) {
version = VCFHeaderVersion.toHeaderVersion(lineFields[1]);
if ( ! version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) )
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]);
if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 )
if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 && version != VCFHeaderVersion.VCF4_3)
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]);
}
headerStrings.add(lineIterator.next());
Expand Down
17 changes: 16 additions & 1 deletion src/main/java/htsjdk/variant/vcf/VCFConstants.java
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,25 @@ public final class VCFConstants {
public static final String FILTER_HEADER_START = "##FILTER";
public static final String FORMAT_HEADER_START = "##FORMAT";
public static final String INFO_HEADER_START = "##INFO";
public static final String ALT_HEADER_START = "##ALT";
public static final String ALT_HEADER_KEY = "ALT";
public static final String ALT_HEADER_START = VCFHeader.METADATA_INDICATOR + ALT_HEADER_KEY ;
public static final String CONTIG_HEADER_KEY = "contig";
public static final String CONTIG_HEADER_START = "##" + CONTIG_HEADER_KEY;

public static final int ALT_HEADER_OFFSET = ALT_HEADER_START.length() + 1;

public static final String PEDIGREE_HEADER_KEY = "PEDIGREE";
public static final String PEDIGREE_HEADER_START = VCFHeader.METADATA_INDICATOR + PEDIGREE_HEADER_KEY;
public static final int PEDIGREE_HEADER_OFFSET = PEDIGREE_HEADER_START.length() + 1;

public static final String SAMPLE_HEADER_KEY = "SAMPLE";
public static final String SAMPLE_HEADER_START = VCFHeader.METADATA_INDICATOR + SAMPLE_HEADER_KEY;
public static final int SAMPLE_HEADER_OFFSET = SAMPLE_HEADER_START.length() + 1;

public static final String META_HEADER_KEY = "META";
public static final String META_HEADER_START = VCFHeader.METADATA_INDICATOR + META_HEADER_KEY;
public static final int META_HEADER_OFFSET = META_HEADER_START.length() + 1;

// old indel alleles
public static final char DELETION_ALLELE_v3 = 'D';
public static final char INSERTION_ALLELE_v3 = 'I';
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/htsjdk/variant/vcf/VCFEncoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
public class VCFEncoder {

/**
* The encoding used for VCF files: ISO-8859-1
* The encoding used for VCF files: ISO-8859-1. When writing VCF4.3 is implemented, this should change to UTF-8.
*/
public static final Charset VCF_CHARSET = Charset.forName("ISO-8859-1");
private static final String QUAL_FORMAT_STRING = "%.2f";
Expand Down
Loading

0 comments on commit d5ac863

Please sign in to comment.