broadinstitute · droazen · Jun 30, 2022 · Jun 17, 2022 · Jun 17, 2022 · Jun 22, 2022
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java
@@ -2,17 +2,14 @@
 
 import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.VariantContext;
-import htsjdk.variant.vcf.VCFConstants;
-import htsjdk.variant.vcf.VCFHeader;
-import htsjdk.variant.vcf.VCFHeaderLineCount;
+import htsjdk.variant.vcf.*;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.broadinstitute.barclay.argparser.Advanced;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
 import org.broadinstitute.barclay.help.DocumentedFeature;
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
-import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
 import picard.cmdline.programgroups.VariantEvaluationProgramGroup;
 import org.broadinstitute.hellbender.engine.FeatureContext;
 import org.broadinstitute.hellbender.engine.ReadsContext;
@@ -212,9 +209,39 @@ public void onTraversalStart() {
         inputHeader = getHeaderForVariants();
         outputStream = createPrintStream();
 
-        if (genotypeFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty()) {
-            samples = Collections.emptySortedSet();
-        } else {
+        // if no fields are specified, default to including every field listed in the header in the table
+        if(fieldsToTake.isEmpty() && genotypeFieldsToTake.isEmpty() && asFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty()){
+            logger.warn("No fields were specified. All fields will be included in output table.");
+
+            // add all mandatory VCF fields (except INFO)
+            for(VCFHeader.HEADER_FIELDS headerField : VCFHeader.HEADER_FIELDS.values()){
+                if(!headerField.name().equals(VCFHeader.HEADER_FIELDS.INFO.name())) {
+                    fieldsToTake.add(headerField.name());
+                }
+            }
+
+            // add all INFO fields present in VCF header
+            for (final VCFInfoHeaderLine infoLine : inputHeader.getInfoHeaderLines()) {
+                fieldsToTake.add(infoLine.getID());
+            }
+
+            // add all FORMAT fields present in VCF header
+            for (final VCFFormatHeaderLine formatLine : inputHeader.getFormatHeaderLines()) {
+                // ensure GT field listed as first FORMAT field
+                if(formatLine.getID().equals("GT")) {
+                    genotypeFieldsToTake.add(0, formatLine.getID());
+                }
+                else {
+                    genotypeFieldsToTake.add(formatLine.getID());
+                }
+            }
+        }
+
+        // if site-level fields are present but no genotype fields are, set samples to empty
+        if (genotypeFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty() && (!fieldsToTake.isEmpty() || !asFieldsToTake.isEmpty())) {
+                samples = Collections.emptySortedSet();
+        }
+        else {
             final Map<String, VCFHeader> vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants());
             samples = VcfUtils.getSortedSampleSet(vcfHeaders, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
 
@@ -238,6 +265,7 @@ public void onTraversalStart() {
             outputStream.println("RecordID\tSample\tVariable\tValue");
         } else {
             final List<String> fields = new ArrayList<>();
+
             fields.addAll(fieldsToTake);
             fields.addAll(asFieldsToTake);
             fields.addAll(createGenotypeFields());

diff --git a/.../broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java b/.../broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java
@@ -5,6 +5,7 @@
 import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
 import org.testng.annotations.Test;
 
+import java.io.File;
 import java.io.IOException;
 import java.util.Arrays;
 
@@ -236,4 +237,31 @@ public void testMoltenOutputWithMultipleAlleles() throws IOException {
         spec.setTrimWhiteSpace(false);
         spec.executeTest("testMoltenOutputWithMultipleAlleles", this);
     }
+
+    @Test
+    public void testNoFieldsSpecified() throws IOException {
+        final File inputFile = new File(getToolTestDataDir(), "extraheaderlinesdeleted_dbsnp_138.snippet.vcf");
+        final File outputFile = createTempFile("noFieldsSpecifiedOutput", ".table"); // name prefix + extension, not a directory
+        final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecified.table");
+
+        // No field arguments are passed, so the tool is expected to fall back to its all-fields default.
+        final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(), "-O", outputFile.getAbsolutePath()};
+        runCommandLine(args);
+
+        IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile);
+    }
+
+    @Test
+    public void testNoFieldsSpecifiedWithSamples() throws IOException {
+        final File inputFile = new File(getToolTestDataDir(), "1000G.phase3.snippet.vcf");
+        final File outputFile = createTempFile("noFieldsSpecifiedWithSamplesOutput", ".table"); // name prefix + extension, not a directory
+        final File expectedFile = new File(getToolTestDataDir(), "expected.noFieldsSpecifiedWithSamples.table");
+
+        // No field arguments are passed; this input VCF has sample columns, so the FORMAT-field default is exercised as well.
+        final String[] args = new String[] {"--variant", inputFile.getAbsolutePath(), "-O", outputFile.getAbsolutePath()};
+        runCommandLine(args);
+
+        IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile);
+    }
+
 }
diff --git a/...dinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/1000G.phase3.snippet.vcf b/...dinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/1000G.phase3.snippet.vcf
@@ -0,0 +1,48 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=LowQual,Description="Low quality">
+##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -1.0597 <= x < 0.1687">
+##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -5.8589 <= x < -1.0597">
+##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -6.5565 <= x < -5.8589">
+##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="Truth sensitivity tranche level for INDEL model at VQS Lod < -70.77">
+##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -70.77 <= x < -6.5565">
+##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -96.2455 <= x < -14.2763">
+##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -437.9355">
+##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -437.9355 <= x < -96.2455">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##GATKCommandLine=<ID=ApplyRecalibration,Version=2.6-20-g0728857,Date="Mon Jul 01 11:58:36 EDT 2013",Epoch=1372694316825,CommandLineOptions="analysis_type=ApplyRecalibration input_file=[] read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[20] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/humgen/1kg/reference/human_g1k_v37.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=5 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false input=[(RodBinding name=input source=/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.lowpass.hc.recal.vcf)] recal_file=(RodBinding name=recal_file source=/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.lowpass.hc.recal.recal) tranches_file=/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.lowpass.hc.recal.tranches out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub 
sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub ts_filter_level=99.0 ignore_filter=null mode=INDEL filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
+##GATKCommandLine=<ID=SelectVariants,CommandLine="SelectVariants --output src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/1000G.phase3.snippet.vcf --sample-name HG00096 --sample-name HG00097 --sample-name HG00099 --variant src/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf --intervals 20:10000054-10000117 --invertSelect false --exclude-non-variants false --exclude-filtered false --preserve-alleles false --remove-unused-alternates false --restrict-alleles-to ALL --keep-original-ac false --keep-original-dp false --mendelian-violation false --invert-mendelian-violation false --mendelian-violation-qual-threshold 0.0 --select-random-fraction 0.0 --remove-fraction-genotypes 0.0 --fully-decode false --max-indel-size 2147483647 --min-indel-size 0 --max-filtered-genotypes 2147483647 --min-filtered-genotypes 0 --max-fraction-filtered-genotypes 1.0 --min-fraction-filtered-genotypes 0.0 --max-nocall-number 2147483647 --max-nocall-fraction 1.0 --set-filtered-gt-to-nocall false --allow-nonoverlapping-command-line-samples false --suppress-reference-path false --genomicsdb-max-alternate-alleles 50 --call-genotypes false --genomicsdb-use-bcf-codec false --genomicsdb-shared-posixfs-optimizations false --genomicsdb-use-gcs-hdfs-connector false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --max-variants-per-shard 0 --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --verbosity INFO --QUIET false 
--use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays  --disable-tool-default-read-filters false",Version="4.2.6.1-22-gd4f083d-SNAPSHOT",Date="June 21, 2022 1:46:50 PM EDT">
+##GATKVersion=2.5-191-g02f8427
+##HaplotypeCaller="analysis_type=HaplotypeCaller input_file=[/humgen/1kg/processing/production_wgs_final/chr20/ALL.chr20.bam.list] read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[/humgen/1kg/processing/production_wgs_final/chr20/.queue/scatterGather/call.for.1000G-1-sg/temp_0001_of_1000/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/humgen/1kg/reference/human_g1k_v37_decoy.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=200 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub graphOutput=null bamOutput=null bam_compression=null disable_bam_indexing=null generate_md5=null simplifyBAM=null bamWriterType=CALLED_HAPLOTYPES 
dbsnp=(RodBinding name= source=UNBOUND) comp=[] annotation=[ClippingRankSumTest, DepthPerSampleHC] excludeAnnotation=[SpanningDeletions, TandemRepeatAnnotator] heterozygosity=0.001 indel_heterozygosity=1.25E-4 genotyping_mode=DISCOVERY output_mode=EMIT_VARIANTS_ONLY standard_min_confidence_threshold_for_calling=10.0 standard_min_confidence_threshold_for_emitting=10.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=6 input_prior=[] contamination_fraction_to_filter=0.05 contamination_fraction_per_sample_file=null p_nonref_model=EXACT_INDEPENDENT exactcallslog=null useDebruijnAssembler=false minKmerForDebruijnAssembler=11 onlyUseKmerSizeForDebruijnAssembler=-1 kmerSize=[10, 25] dontIncreaseKmerSizesForCycles=false numPruningSamples=3 maxPathsPerSample=8 dontRecoverDanglingTails=false minPruning=2 gcpHMM=10 includeUmappedReads=false useAllelesTrigger=false useFilteredReadsForAnnotations=false phredScaledGlobalReadMismappingRate=45 maxNumHaplotypesInPopulation=25 mergeVariantsViaLD=false pair_hmm_implementation=LOGLESS_CACHING keepRG=null justDetermineActiveRegions=false dontGenotype=false errorCorrectKmers=false debug=false debugGraphTransformations=false useLowQualityBasesForAssembly=false dontTrimActiveRegions=false dontUseSoftClippedBases=false captureAssemblyFailureBAM=false allowCyclesInKmerGraphToGeneratePaths=false errorCorrectReads=false kmerLengthForReadErrorCorrection=25 minObservationsForKmerToBeSolid=20 activityProfileOut=null activeRegionOut=null activeRegionIn=null activeRegionExtension=null forceActive=false activeRegionMaxSize=null bandPassSigma=null min_mapping_quality_score=20 filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false"
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
+##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
+##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
+##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
+##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
+##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
+##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
+##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
+##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
+##INFO=<ID=NEGATIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the negative training set of bad variants">
+##INFO=<ID=POSITIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the positive training set of good variants">
+##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
+##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
+##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds ratio of being a true variant versus being false under the trained gaussian mixture model">
+##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
+##contig=<ID=20,length=63025520>
+##contig=<ID=21,length=48129895>
+##source=SelectVariants
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	HG00096	HG00097	HG00099
+20	10000054	.	CTTTG	C	504.42	PASS	AC=0;AF=0.00;AN=6;BaseQRankSum=-0.975;ClippingRankSum=-2.925;DP=22;FS=1.899;InbreedingCoeff=0.0592;MQ=59.27;MQ0=0;MQRankSum=-3.212;QD=2.43;ReadPosRankSum=-0.264;VQSLOD=5.10;culprit=FS	GT:AD:DP:GQ:PL	0/0:2,0:2:6:0,6,119	0/0:10,0:10:29:0,29,592	0/0:10,0:10:30:0,30,598
+20	10000107	.	T	C	263.95	PASS	AC=0;AF=0.00;AN=6;BaseQRankSum=-0.444;ClippingRankSum=-3.132;DP=25;FS=0.948;InbreedingCoeff=-0.0102;MQ=59.19;MQ0=0;MQRankSum=2.292;POSITIVE_TRAIN_SITE;QD=10.56;ReadPosRankSum=0.055;VQSLOD=7.76;culprit=FS	GT:AD:DP:GQ:PL	0/0:5,0:5:15:0,15,387	0/0:13,0:13:42:0,42,786	0/0:7,0:7:24:0,24,548
+20	10000117	.	C	T	329458.17	PASS	AC=1;AF=0.167;AN=6;BaseQRankSum=10.505;ClippingRankSum=-20.658;DP=28;FS=8.305;InbreedingCoeff=0.1727;MQ=59.17;MQ0=0;MQRankSum=2.689;POSITIVE_TRAIN_SITE;QD=25.46;ReadPosRankSum=-4.688;VQSLOD=3.19;culprit=ReadPosRankSum	GT:AD:DP:GQ:PL	0/0:5,0:5:15:0,15,189	0/1:8,8:16:99:254,0,231	0/0:7,0:7:21:0,21,271
diff --git a/...te/hellbender/tools/walkers/variantutils/VariantsToTable/expected.noFieldsSpecified.table b/...te/hellbender/tools/walkers/variantutils/VariantsToTable/expected.noFieldsSpecified.table
@@ -0,0 +1,4 @@
+CHROM	POS	ID	REF	ALT	QUAL	FILTER	AC	CAF	CLNDSDB	COMMON	DP	DSS	HD	KGPROD	KGPhase1	LSD	REF	RS	RSPOS	SAO	SSR	VC	VP	WGT	dbSNPBuildID
+20	10000092	rs183046704	T	G	-10.0	PASS	NA	[0.9991,0.0009183]	NA	0	NA	NA	NA	true	true	NA	T	183046704	10000092	0	0	SNV	0x050000000001000014000100	1	135
+20	10000107	rs188245245	T	C	-10.0	PASS	NA	[0.9982,0.001837]	NA	0	NA	NA	NA	true	true	NA	T	188245245	10000107	0	0	SNV	0x050000000001000014000100	1	135
+20	10000117	rs4816203	C	T	-10.0	PASS	NA	[0.3682,0.6318]	NA	1	NA	NA	NA	true	true	NA	C	4816203	10000117	0	0	SNV	0x05010000000117011e000100	1	111