-
Notifications
You must be signed in to change notification settings - Fork 594
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
VariantsToTable: Include all fields when none specified #7911
Changes from 6 commits
f20763f
3e9d123
dc7ff59
0a6d1ef
8a33af3
9b956ba
cb09924
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,17 +2,14 @@ | |
|
||
import htsjdk.variant.variantcontext.Allele; | ||
import htsjdk.variant.variantcontext.VariantContext; | ||
import htsjdk.variant.vcf.VCFConstants; | ||
import htsjdk.variant.vcf.VCFHeader; | ||
import htsjdk.variant.vcf.VCFHeaderLineCount; | ||
import htsjdk.variant.vcf.*; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.broadinstitute.barclay.argparser.Advanced; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||
import org.broadinstitute.barclay.help.DocumentedFeature; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; | ||
import picard.cmdline.programgroups.VariantEvaluationProgramGroup; | ||
import org.broadinstitute.hellbender.engine.FeatureContext; | ||
import org.broadinstitute.hellbender.engine.ReadsContext; | ||
|
@@ -38,7 +35,8 @@ | |
* This tool extracts specified fields for each variant in a VCF file to a tab-delimited table, which may be easier | ||
* to work with than a VCF. By default, the tool only extracts PASS or . (unfiltered) variants in the VCF file. Filtered variants may be | ||
* included in the output by adding the --show-filtered flag. The tool can extract both INFO (i.e. site-level) fields and | ||
* FORMAT (i.e. sample-level) fields. | ||
* FORMAT (i.e. sample-level) fields. If the tool is run without specifying any fields, it defaults to include all fields | ||
* declared in the VCF header. | ||
* </p> | ||
* | ||
* <h4>INFO/site-level fields</h4> | ||
|
@@ -100,6 +98,12 @@ | |
* 1 65068538 SNP 49,0 35,4 | ||
* 1 111146235 SNP 69,1 77,4 | ||
* </pre> | ||
* <pre> | ||
* gatk VariantsToTable \ | ||
* -V input.vcf \ | ||
* -O output.table | ||
* </pre> | ||
* <p>would produce a file that includes all fields declared in the VCF header.</p> | ||
* | ||
* <h3>Notes</h3> | ||
* <ul> | ||
|
@@ -212,9 +216,39 @@ public void onTraversalStart() { | |
inputHeader = getHeaderForVariants(); | ||
outputStream = createPrintStream(); | ||
|
||
// if no fields specified, default to include all fields listed in header into table | ||
if(fieldsToTake.isEmpty() && genotypeFieldsToTake.isEmpty() && asFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty()){ | ||
logger.warn("No fields were specified. All fields declared in the VCF header will be included in the output table."); | ||
|
||
// add all mandatory VCF fields (except INFO) | ||
for(VCFHeader.HEADER_FIELDS headerField : VCFHeader.HEADER_FIELDS.values()){ | ||
if(!headerField.name().equals(VCFHeader.HEADER_FIELDS.INFO.name())) { | ||
fieldsToTake.add(headerField.name()); | ||
} | ||
} | ||
|
||
// add all INFO fields present in VCF header | ||
for (final VCFInfoHeaderLine infoLine : inputHeader.getInfoHeaderLines()) { | ||
fieldsToTake.add(infoLine.getID()); | ||
} | ||
|
||
// add all FORMAT fields present in VCF header | ||
for (final VCFFormatHeaderLine formatLine : inputHeader.getFormatHeaderLines()) { | ||
// ensure GT field listed as first FORMAT field | ||
if(formatLine.getID().equals(VCFConstants.GENOTYPE_KEY)) { | ||
genotypeFieldsToTake.add(0, formatLine.getID()); | ||
} | ||
else { | ||
genotypeFieldsToTake.add(formatLine.getID()); | ||
} | ||
} | ||
} | ||
|
||
// if fields specified, but none are genotype fields, set samples to empty | ||
if (genotypeFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty()) { | ||
samples = Collections.emptySortedSet(); | ||
} else { | ||
samples = Collections.emptySortedSet(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This line is now indented one level too deep. |
||
} | ||
else { | ||
final Map<String, VCFHeader> vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants()); | ||
samples = VcfUtils.getSortedSampleSet(vcfHeaders, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); | ||
|
||
|
@@ -238,6 +272,7 @@ public void onTraversalStart() { | |
outputStream.println("RecordID\tSample\tVariable\tValue"); | ||
} else { | ||
final List<String> fields = new ArrayList<>(); | ||
|
||
fields.addAll(fieldsToTake); | ||
fields.addAll(asFieldsToTake); | ||
fields.addAll(createGenotypeFields()); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; | ||
import org.testng.annotations.Test; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.util.Arrays; | ||
|
||
|
@@ -236,4 +237,44 @@ public void testMoltenOutputWithMultipleAlleles() throws IOException { | |
spec.setTrimWhiteSpace(false); | ||
spec.executeTest("testMoltenOutputWithMultipleAlleles", this); | ||
} | ||
|
||
@Test | ||
public void testNoFieldsSpecified() throws IOException { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test case should probably be renamed |
||
final File inputFile = new File(getToolTestDataDir(), "VCFWithoutGenotypes_dbsnp_138.snippet.vcf"); | ||
final File outputFile = createTempFile("noFieldsSpecifiedOutput", ".table"); | ||
final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecified.table"); | ||
|
||
final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(), | ||
"-O", outputFile.getAbsolutePath()}; | ||
runCommandLine(args); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These tests don't actually assert anything. Typically (even for "this shouldn't crash" type tests) we would want to assert that the files match the expected outputs. It looks like you are most of the way there, though. You have already checked in some output files that look reasonable; now all you need to do is check that the output of THIS run matches those. One way to do that is with |
||
|
||
IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile); | ||
} | ||
|
||
@Test | ||
public void testNoFieldsSpecifiedWithSamples() throws IOException { | ||
final File inputFile = new File(getToolTestDataDir(), "VCFWithGenotypes_1000G.phase3.snippet.vcf"); | ||
final File outputFile = createTempFile("noFieldsSpecifiedWithSamplesOutput", ".table"); | ||
final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecifiedWithSamples.table"); | ||
|
||
final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(), | ||
"-O", outputFile.getAbsolutePath()}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile); | ||
} | ||
|
||
@Test | ||
public void testNoFieldsSpecifiedFormatFieldInHeaderNoSamples() throws IOException { | ||
final File inputFile = new File(getToolTestDataDir(), "VCFWithoutGenotypesWithFormatField_dbsnp_138.snippet.vcf"); | ||
final File outputFile = createTempFile("noFieldsSpecifiedNoSamplesOutput", ".table"); | ||
final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecifiedNoSamples.table"); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test case should produce output identical to the first test case above, so you should use the same expected output file as that test uses to make this clear. |
||
|
||
final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(), | ||
"-O", outputFile.getAbsolutePath()}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile); | ||
} | ||
|
||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. After reviewing the changes to the tool, I think you should add a third integration test with a VCF that contains declarations for FORMAT fields in its header, but no samples. This will exercise the following case:
It's a bit weird, but theoretically possible. You should be able to create this by making a copy of your |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the tool documentation at the top of the class, document the fact that if the tool is run without specifying any fields it defaults to including all fields declared in the VCF header. Also include an example command line for that case in the "Usage Example" section.