Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix VCFFuncotation output order bug #6178

Merged
merged 5 commits into from
Oct 4, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;

/**
* GATKRegistrator registers Serializers for our project. We need a JsonSerializer for the Google Genomics classes
Expand All @@ -52,7 +53,7 @@ public static void registerFuncotationMapDependencies(final Kryo kryo) {
Registration registration = kryo.register(TableFuncotation.class);
registration.setInstantiator(new ObjectInstantiator<TableFuncotation>() {
public TableFuncotation newInstance() {
return TableFuncotation.create(new HashMap<>(), Allele.UNSPECIFIED_ALTERNATE_ALLELE, "TEMP", null);
return TableFuncotation.create(new LinkedHashMap<>(), Allele.UNSPECIFIED_ALTERNATE_ALLELE, "TEMP", null);
}
});
registration = kryo.register(VcfFuncotationMetadata.class);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -192,21 +192,6 @@ public static TableFuncotation create(final LinkedHashSet<String> fieldNames, fi
return create(new ArrayList<>(fieldNames), fieldValues, altAllele, dataSourceName, metadata );
}

/**
* See {@link TableFuncotation#create(List, List, Allele, String, FuncotationMetadata)}
*
* @param data Map for field name to field value. The field value will be converted to a String. Never {@code null}
* @param altAllele See {@link TableFuncotation#create(List, List, Allele, String, FuncotationMetadata)}
* @param dataSourceName See {@link TableFuncotation#create(List, List, Allele, String, FuncotationMetadata)}
* @param metadata See {@link TableFuncotation#create(List, List, Allele, String, FuncotationMetadata)}
* @return See {@link TableFuncotation#create(List, List, Allele, String, FuncotationMetadata)}
*/
public static TableFuncotation create(final Map<String, Object> data, final Allele altAllele, final String dataSourceName, final FuncotationMetadata metadata ) {
final List<String> fieldNames = new ArrayList<>(data.keySet());
final List<String> fieldValues = fieldNames.stream().map(f -> data.get(f).toString()).collect(Collectors.toList());
return create(fieldNames, fieldValues, altAllele, dataSourceName, metadata);
}

/**
* See {@link TableFuncotation#create(List, List, Allele, String, FuncotationMetadata)}
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.funcotator.DataSourceFuncotationFactory;
import org.broadinstitute.hellbender.tools.funcotator.Funcotation;
import org.broadinstitute.hellbender.tools.funcotator.FuncotatorArgumentDefinitions;
Expand All @@ -25,6 +26,7 @@
import java.nio.file.Path;
import java.util.*;
import java.util.function.BiFunction;
import java.util.function.BinaryOperator;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -59,7 +61,7 @@ public class VcfFuncotationFactory extends DataSourceFuncotationFactory {
* The field names that this {@link VcfFuncotationFactory} supports
* and default values for each.
*/
private final LinkedHashMap<String, Object> supportedFieldNamesAndDefaults;
private final LinkedHashMap<String, String> supportedFieldNamesAndDefaults;

/**
* A list of values to use when there are no annotations for an allele.
Expand Down Expand Up @@ -248,14 +250,14 @@ protected List<Funcotation> createDefaultFuncotationsOnVariant( final VariantCon
}
}

@Override
/**
* {@inheritDoc}
*
* This is really the only entry point for the Vcf FuncotationFactory.
*
* {@link VcfFuncotationFactory} can be used with or without Gencode annotations.
*/
@Override
protected List<Funcotation> createFuncotationsOnVariant(final VariantContext variant, final ReferenceContext referenceContext, final List<Feature> featureList) {

final List<Funcotation> outputFuncotations = new ArrayList<>();
Expand Down Expand Up @@ -294,7 +296,7 @@ protected List<Funcotation> createFuncotationsOnVariant(final VariantContext var
final Allele queryAltAllele = variant.getAlternateAllele(i);
if (matchIndex != -1) {

final LinkedHashMap<String, Object> annotations = new LinkedHashMap<>(supportedFieldNamesAndDefaults);
final LinkedHashMap<String, String> annotations = new LinkedHashMap<>(supportedFieldNamesAndDefaults);

for (final Map.Entry<String, Object> entry : featureVariant.getAttributes().entrySet()) {
populateAnnotationMap(featureVariant, variant, matchIndex, annotations, entry);
Expand Down Expand Up @@ -333,10 +335,13 @@ private static Funcotation mergeDuplicateFuncotationFactoryVariant(final Funcota
final LinkedHashSet<String> allFieldNames = funcotation1.getFieldNames();
allFieldNames.addAll(funcotation2.getFieldNames());

final Map<String, Object> mergedFieldsMap = allFieldNames.stream()
.collect(Collectors.toMap(f -> f, f -> mergeFuncotationValue(f, funcotation1, funcotation2, VcfFuncotationFactory::renderFieldConflicts)));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could use the Collectors.toMap() overloaded method that takes a Supplier and pass in LinkedHashMap::new. The default uses HashMap::new.

In this case, using streaming doesn't add much, but for longer chains of stream operations this would be a good change. Might be worth having a utility function in GATK to make it easy.

See https://docs.oracle.com/javase/8/docs/api/java/util/stream/Collectors.html#toMap-java.util.function.Function-java.util.function.Function-java.util.function.BinaryOperator-java.util.function.Supplier-

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, you could do that. You end up with a funny unused merge function if you do because this is assuming that you've pre-uniqued the inputs so I figured I'd just switch it to a for loop.

return TableFuncotation.create(mergedFieldsMap, funcotation1.getAltAllele(), funcotation1.getDataSourceName(),
merge(funcotation1.getMetadata(), funcotation2.getMetadata()));
final LinkedHashMap<String, String> mergedFieldsMap = new LinkedHashMap<>();
for (final String fieldName: allFieldNames){
mergedFieldsMap.put(fieldName, mergeFuncotationValue(fieldName, funcotation1, funcotation2));
}

return TableFuncotation.create(mergedFieldsMap, funcotation1.getAltAllele(), funcotation1.getDataSourceName(),
merge(funcotation1.getMetadata(), funcotation2.getMetadata()));
}

/**
Expand All @@ -357,20 +362,17 @@ private static FuncotationMetadata merge(final FuncotationMetadata funcotationMe
* @param fieldName the annotation to determine.
* @param funcotation1 first region to merge.
* @param funcotation2 second region to merge.
* @param conflictFunction the function to run to solve conflicts.
* @return string with the new, merged value of the annotation. Returns {@code null} if the annotation name
* does not exist in either region.
*/
private static String mergeFuncotationValue(final String fieldName, final Funcotation funcotation1,
final Funcotation funcotation2, final BiFunction<String, String, String> conflictFunction) {
final Funcotation funcotation2) {
final boolean doesRegion1ContainAnnotation = funcotation1.hasField(fieldName);
final boolean doesRegion2ContainAnnotation = funcotation2.hasField(fieldName);

if (doesRegion1ContainAnnotation && doesRegion2ContainAnnotation) {

// Both regions contain an annotation and presumably these are of different values.
return conflictFunction.apply(funcotation1.getField(fieldName),
funcotation2.getField(fieldName));
return VcfFuncotationFactory.renderFieldConflicts(funcotation1.getField(fieldName), funcotation2.getField(fieldName));
} else if (doesRegion1ContainAnnotation) {
return funcotation1.getField(fieldName);
} else if (doesRegion2ContainAnnotation) {
Expand All @@ -384,7 +386,7 @@ private static String renderFieldConflicts(final String value1, final String val
return value1 + DUPLICATE_RECORD_DELIMITER + value2;
}

private void populateAnnotationMap(final VariantContext funcotationFactoryVariant, final VariantContext queryVariant, final int funcotationFactoryAltAlleleIndex, final LinkedHashMap<String, Object> annotations, final Map.Entry<String, Object> attributeEntry) {
private void populateAnnotationMap(final VariantContext funcotationFactoryVariant, final VariantContext queryVariant, final int funcotationFactoryAltAlleleIndex, final LinkedHashMap<String, String> annotations, final Map.Entry<String, Object> attributeEntry) {
final String valueString;
final String attributeName = attributeEntry.getKey();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,7 @@ public void write(final VariantContext variant, final FuncotationMap txToFuncota
Stream.concat(funcotations.stream(), Stream.of(manualAnnotationFuncotation))
.filter(f -> f.getAltAllele().equals(altAllele))
.filter(f -> f.getFieldNames().size() > 0)
.filter(f -> {
return !f.getDataSourceName().equals(FuncotatorConstants.DATASOURCE_NAME_FOR_INPUT_VCFS);
})
.filter(f -> !f.getDataSourceName().equals(FuncotatorConstants.DATASOURCE_NAME_FOR_INPUT_VCFS))
.map(VcfOutputRenderer::adjustIndelAlleleInformation)
.map(f -> FuncotatorUtils.renderSanitizedFuncotationForVcf(f, finalFuncotationFieldNames))
.collect(Collectors.joining(FIELD_DELIMITER))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -888,11 +888,11 @@ public void testGetAlleles(final FuncotationMap funcotationMap, final LinkedHash
@Test
public void testCopy() {
final FuncotationMap old = FuncotationMap.createEmpty();
old.add("TEST_TX", TableFuncotation.create(ImmutableSortedMap.of("f1", "v1", "f2", "v2"),
old.add("TEST_TX", TableFuncotation.create(new LinkedHashMap<>(ImmutableSortedMap.of("f1", "v1", "f2", "v2")),
Allele.create("C"), "TEST_DATA", null));

final FuncotationMap newFuncotationMap = FuncotationMap.create(old);
newFuncotationMap.add("TEST_TX2", TableFuncotation.create(ImmutableSortedMap.of("f3", "v1", "f4", "v2"),
newFuncotationMap.add("TEST_TX2", TableFuncotation.create(new LinkedHashMap<>(ImmutableSortedMap.of("f3", "v1", "f4", "v2")),
Allele.create("C"), "TEST_DATA", null));

Assert.assertEquals(newFuncotationMap.getTranscriptList(), Arrays.asList("TEST_TX", "TEST_TX2"));
Expand All @@ -907,11 +907,11 @@ public void testCopyWithGencode() {
final FuncotationMap old = FuncotationMap.createFromGencodeFuncotations(Collections.singletonList(new GencodeFuncotationBuilder()
.setAnnotationTranscript("TEST_TX")
.build()));
old.add("TEST_TX", TableFuncotation.create(ImmutableSortedMap.of("f1", "v1", "f2", "v2"),
old.add("TEST_TX", TableFuncotation.create(new LinkedHashMap<>(ImmutableSortedMap.of("f1", "v1", "f2", "v2")),
Allele.create("C"), "TEST_DATA", null));

final FuncotationMap newFuncotationMap = FuncotationMap.create(old);
newFuncotationMap.add("TEST_TX2", TableFuncotation.create(ImmutableSortedMap.of("f3", "v1", "f4", "v2"),
newFuncotationMap.add("TEST_TX2", TableFuncotation.create(new LinkedHashMap<>(ImmutableSortedMap.of("f3", "v1", "f4", "v2")),
Allele.create("C"), "TEST_DATA", null));

Assert.assertEquals(newFuncotationMap.getTranscriptList(), Arrays.asList("TEST_TX", "TEST_TX2"));
Expand All @@ -927,7 +927,7 @@ public void testSerializationRoundTrip() {
final FuncotationMap funcotationMap = FuncotationMap.createFromGencodeFuncotations(Collections.singletonList(new GencodeFuncotationBuilder()
.setAnnotationTranscript(FuncotationMap.NO_TRANSCRIPT_AVAILABLE_KEY)
.build()));
funcotationMap.add(FuncotationMap.NO_TRANSCRIPT_AVAILABLE_KEY, TableFuncotation.create(ImmutableSortedMap.of("f1", "v1", "f2", "v2"),
funcotationMap.add(FuncotationMap.NO_TRANSCRIPT_AVAILABLE_KEY, TableFuncotation.create(new LinkedHashMap<>(ImmutableSortedMap.of("f1", "v1", "f2", "v2")),
Allele.create("C"), "TEST_DATA", null));

final Consumer<Kryo> registerFuncotationMapForKryo = GATKRegistrator::registerFuncotationMapDependencies;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,11 @@ public class FuncotatorIntegrationTest extends CommandLineProgramTest {

private static final String XSV_CLINVAR_MULTIHIT_TEST_VCF = toolsTestDir + "funcotator" + File.separator + "clinvar_hg19_multihit_test.vcf";
private static final String FILTER_TEST_VCF = toolsTestDir + "funcotator" + File.separator + "FILTER_test.vcf";
private static final String VCF_FIELD_ORDER_SWAP_TEST_VCF = toolsTestDir + "funcotator" + File.separator + "vcfBugRepro.vcf";
private static final String DS_XSV_CLINVAR_TESTS = largeFileTestDir + "funcotator" + File.separator + "small_ds_clinvar_hg19" + File.separator;
private static final String DS_FILTER_PARSE_TESTS = largeFileTestDir + "funcotator" + File.separator + "small_ds_FILTER_test" + File.separator;

public static final String VCF_FIELD_ORDER_TEST_DATA_SOURCES = largeFileTestDir + "funcotator" + File.separator + "vcfFuncotationOrderingBugRepro" + File.separator;

private static final String NOT_M2_TEST_HG19 = toolsTestDir + "funcotator/NotM2_test_custom_maf_fields.vcf";
private static final String M2_TEST_HG19 = toolsTestDir + "funcotator/M2_test_custom_maf_fields.vcf";
private static final String NOT_M2_TEST_HG19_TUMOR_ONLY = toolsTestDir + "funcotator/NotM2_test_custom_maf_fields_tumor_only.vcf";
Expand Down Expand Up @@ -300,6 +302,14 @@ private Object[][] provideForNonTrivialLargeDataValidationTest() {
FuncotatorTestConstants.FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER,
FuncotatorTestConstants.NON_TRIVIAL_DATA_VALIDATION_TEST_HG19_DATA_SET_2_EXPECTED_OUTPUT
},
{
//This tests https://github.com/broadinstitute/gatk/issues/6173
FuncotatorTestConstants.SINGLE_LINE,
b37Reference,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
FuncotatorTestConstants.FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER,
FuncotatorTestConstants.SINGLE_LINE_EXPECTED
},
{
FuncotatorTestConstants.NON_TRIVIAL_DATA_VALIDATION_TEST_HG38,
hg38Reference,
Expand Down Expand Up @@ -921,6 +931,46 @@ public void testCanAnnotateHg38ClinvarAndGencodeV28() {
.count(), NUM_CLINVAR_HITS);
}

//Test for https://github.com/broadinstitute/gatk/issues/6173
@Test
public void testVCFColumnsArentShuffled() {
final File outputFile = createTempFile("tmpTestFilterParsing", "vcf");

final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
VCF_FIELD_ORDER_SWAP_TEST_VCF,
outputFile,
b37Reference,
VCF_FIELD_ORDER_TEST_DATA_SOURCES,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
FuncotatorArgumentDefinitions.OutputFormatType.VCF,
false);

arguments.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true);

runCommandLine(arguments);

final Pair<VCFHeader, List<VariantContext>> tempVcf = VariantContextTestUtils.readEntireVCFIntoMemory(outputFile.getAbsolutePath());
Assert.assertEquals( tempVcf.getRight().size(), 1 );

final String[] funcotatorKeys = FuncotatorUtils.extractFuncotatorKeysFromHeaderDescription(tempVcf.getLeft().getInfoHeaderLine(VcfOutputRenderer.FUNCOTATOR_VCF_FIELD_NAME).getDescription());

final VariantContext variantContext = tempVcf.getRight().get(0);
final Map<Allele, FuncotationMap> funcs = FuncotatorUtils.createAlleleToFuncotationMapFromFuncotationVcfAttribute(
funcotatorKeys, variantContext, "Gencode_19_annotationTranscript", "FAKE_SOURCE");

Allele allele = variantContext.getAlternateAllele(0);
final String txId = funcs.get(allele).getTranscriptList().get(0);
Assert.assertEquals( funcs.get(allele).get(txId).size(), 1 );

final Funcotation funcotation = funcs.get(allele).get(txId).get(0);

//Assert that the value of the field F# is F#|F# encoded in Funcotator's percent encoding scheme
for(int i = 1; i <= 9; i++){
Assert.assertEquals(funcotation.getField("dbSnp_F"+i), "F"+i+"_%7C_"+"F"+i);
}
}


@Test
public void testFilterParsing() {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ public class FuncotatorTestConstants {
public static final String NON_TRIVIAL_DATA_VALIDATION_TEST_HG19_DATA_SET_1 = VALIDATION_TEST_DATA_DIR + "regressionTestVariantSet1.vcf";
public static final String NON_TRIVIAL_DATA_VALIDATION_TEST_HG19_DATA_SET_1_EXPECTED_OUTPUT = VALIDATION_TEST_DATA_DIR + "regressionTestVariantSet1_expected.vcf";
public static final String NON_TRIVIAL_DATA_VALIDATION_TEST_HG19_DATA_SET_2 = VALIDATION_TEST_DATA_DIR + "regressionTestVariantSet2.vcf";
public static final String SINGLE_LINE = VALIDATION_TEST_DATA_DIR + "hashSetOrderingIssue.vcf";
public static final String SINGLE_LINE_EXPECTED = VALIDATION_TEST_DATA_DIR + "hashSetOrderingIssue_expected.vcf";
public static final String NON_TRIVIAL_DATA_VALIDATION_TEST_HG19_DATA_SET_2_EXPECTED_OUTPUT = VALIDATION_TEST_DATA_DIR + "regressionTestVariantSet2_expected.vcf";
public static final String NON_TRIVIAL_DATA_VALIDATION_TEST_HG38 = VALIDATION_TEST_DATA_DIR + "regressionTestVariantSetHG38.vcf";
public static final String NON_TRIVIAL_DATA_VALIDATION_TEST_EXPECTED_OUTPUT = VALIDATION_TEST_DATA_DIR + "regressionTestVariantSetHG38_expected.vcf";
Expand Down
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Loading