Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make M2 haplotype and clustered events filters smarter about germline events #8717

Merged
merged 13 commits into from
Mar 25, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.collections4.ListUtils;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
Expand Down Expand Up @@ -113,12 +114,14 @@ public CalledHaplotypes callMutations(
}
final AlleleLikelihoods<Fragment, Haplotype> logFragmentLikelihoods = logReadLikelihoods.groupEvidence(MTAC.independentMates ? read -> read : GATKRead::getName, Fragment::createAndAvoidFailure);

final Set<Event> potentialSomaticEventsInRegion = new HashSet<>();
for( final int loc : eventStarts ) {
final List<VariantContext> eventsAtThisLoc = AssemblyBasedCallerUtils.getVariantsFromActiveHaplotypes(loc, haplotypes, false);
VariantContext mergedVC = AssemblyBasedCallerUtils.makeMergedVariantContext(eventsAtThisLoc);
if( mergedVC == null ) {
VariantContext merged = AssemblyBasedCallerUtils.makeMergedVariantContext(eventsAtThisLoc);
davidbenjamin marked this conversation as resolved.
Show resolved Hide resolved
if( merged == null ) {
continue;
}
final VariantContext mergedVC = emitRefConf ? ReferenceConfidenceUtils.addNonRefSymbolicAllele(merged) : merged;

// converting haplotype likelihoods to allele likelihoods
final Map<Allele, List<Haplotype>> alleleMapper = AssemblyBasedCallerUtils.createAlleleMapper(mergedVC, loc, haplotypes, true);
Expand All @@ -127,7 +130,6 @@ public CalledHaplotypes callMutations(
logLikelihoods.retainEvidence(variantCallingRelevantFragmentOverlap::overlaps);

if (emitRefConf) {
mergedVC = ReferenceConfidenceUtils.addNonRefSymbolicAllele(mergedVC);
logLikelihoods.addNonReferenceAllele(Allele.NON_REF_ALLELE);
}
final List<LikelihoodMatrix<Fragment, Allele>> tumorMatrices = IntStream.range(0, logLikelihoods.numberOfSamples())
Expand All @@ -152,13 +154,21 @@ public CalledHaplotypes callMutations(
.filter(allele -> forcedAlleles.contains(allele) || tumorLogOdds.getAlt(allele) > MTAC.getEmissionLogOdds())
.collect(Collectors.toList());

final long somaticAltCount = tumorAltAlleles.stream()
final List<Allele> allelesToGenotype = tumorAltAlleles.stream()
.filter(allele -> forcedAlleles.contains(allele) || !hasNormal || MTAC.genotypeGermlineSites || normalLogOdds.getAlt(allele) > MathUtils.log10ToLog(MTAC.normalLog10Odds))
.count();
.toList();

// record somatic alleles for later use in the Event Count annotation
// note that in tumor-only calling it does not attempt to detect germline events
mergedVC.getAlternateAlleles().stream()
.filter(allele -> tumorLogOdds.getAlt(allele) > MTAC.getEmissionLogOdds())
.filter(allele -> !hasNormal || normalLogOdds.getAlt(allele) > MathUtils.log10ToLog(MTAC.normalLog10Odds))
.map(allele -> new Event(mergedVC.getContig(), mergedVC.getStart(), mergedVC.getReference(), allele))
.forEach(potentialSomaticEventsInRegion::add);

// if every alt allele is germline, skip this variant. However, if some alt alleles are germline and others
// are not we emit them all so that the filtering engine can see them
if (somaticAltCount == 0) {
if (allelesToGenotype.isEmpty()) {
continue;
}

Expand Down Expand Up @@ -222,8 +232,39 @@ public CalledHaplotypes callMutations(

final List<VariantContext> outputCalls = AssemblyBasedCallerUtils.phaseCalls(returnCalls, calledHaplotypes);
final int eventCount = outputCalls.size();

// calculate the number of somatic events in the best haplotype of each called variant
final Map<Haplotype, MutableInt> haplotypeSupportCounts = logReadLikelihoods.alleles().stream()
.collect(Collectors.toMap(hap -> hap, label -> new MutableInt(0)));
logReadLikelihoods.bestAllelesBreakingTies()
.forEach(bestHaplotype -> haplotypeSupportCounts.get(bestHaplotype.allele).increment());

final Map<Event, List<Haplotype>> haplotypesByEvent= new HashMap<>();
for (final Haplotype haplotype : logReadLikelihoods.alleles()) {
for (final Event event : haplotype.getEventMap().getEvents()) {
haplotypesByEvent.computeIfAbsent(event, e -> new ArrayList<>()).add(haplotype);
}
}
final Map<VariantContext, List<Integer>> eventCountAnnotations = new HashMap<>();
for (final VariantContext outputCall : outputCalls) {
for (final Allele allele : outputCall.getAlternateAlleles()) {
// note: this creates the minimal representation behind the scenes
final Event event = new Event(outputCall.getContig(), outputCall.getStart(), outputCall.getReference(), allele);
// haplotypesByEvent contains every *assembled* event, including events injected into the original assembly,
// but there are some modes where we genotype events that were never in an assembly graph, in which case
// this annotation is irrelevant
if (haplotypesByEvent.containsKey(event)) {
final Haplotype bestHaplotype = haplotypesByEvent.get(event).stream()
.sorted(Comparator.comparingInt(h -> haplotypeSupportCounts.getOrDefault(h, new MutableInt(0)).intValue()).reversed())
.findFirst().get();

eventCountAnnotations.computeIfAbsent(outputCall, vc -> new ArrayList<>())
.add((int) bestHaplotype.getEventMap().getEvents().stream().filter(potentialSomaticEventsInRegion::contains).count());
}
}
}
final List<VariantContext> outputCallsWithEventCountAnnotation = outputCalls.stream()
.map(vc -> new VariantContextBuilder(vc).attribute(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount).make())
.map(vc -> new VariantContextBuilder(vc).attribute(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCountAnnotations.get(vc)).make())
.collect(Collectors.toList());
return new CalledHaplotypes(outputCallsWithEventCountAnnotation, calledHaplotypes);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ public ClusteredEventsFilter(final int maxEventsInRegion) {

@Override
public boolean isArtifact(final VariantContext vc, final Mutect2FilteringEngine filteringEngine) {
final Integer eventCount = vc.getAttributeAsInt(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, -1);
return eventCount > maxEventsInRegion;
final List<Integer> eventCounts = vc.getAttributeAsIntList(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, 0);
return eventCounts.stream().mapToInt(n -> n).max().getAsInt() > maxEventsInRegion;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import java.util.*;

public class FilteredHaplotypeFilter extends Mutect2VariantFilter {
private static final double GERMLINE_PROBABILITY_TO_IGNORE_NORMAL_ARTIFACT = 0.25;
private final double maxIntraHaplotypeDistance;

// for each pgt + pid phasing string, a list of loci-error probability pairs
Expand Down Expand Up @@ -54,10 +55,21 @@ public double calculateErrorProbability(final VariantContext vc, final Mutect2Fi

@Override
protected void accumulateDataForLearning(final VariantContext vc, final ErrorProbabilities errorProbabilities, final Mutect2FilteringEngine filteringEngine) {
// we record the maximum non-sequencing artifact that is not this filter itself
final double artifactProbability = errorProbabilities.getProbabilitiesByFilter().entrySet().stream()
.filter(e -> e.getKey().errorType() != ErrorType.SEQUENCING)
.filter(e -> !e.getKey().filterName().equals(filterName()))
// we record the maximum non-sequencing, non-germline, artifact probability that is not from this filter itself
final Map<Mutect2Filter, List<Double>> probabilitiesByFilter = errorProbabilities.getProbabilitiesByFilter();

final double germlineProbability = probabilitiesByFilter.entrySet().stream()
.filter(e -> e.getKey().filterName() == GATKVCFConstants.GERMLINE_RISK_FILTER_NAME)
.flatMap(e -> e.getValue().stream()) // the value is a list of double, we need the max of all the lists
.max(Double::compareTo).orElse(0.0);

// the normal artifact filter often lights up when there's a non-artifactual germline event, which we don't want here
final boolean ignoreNormalArtifact = germlineProbability > GERMLINE_PROBABILITY_TO_IGNORE_NORMAL_ARTIFACT;

final double artifactProbability = probabilitiesByFilter.entrySet().stream()
.filter(e -> e.getKey().errorType() == ErrorType.ARTIFACT)
.filter(e -> !(ignoreNormalArtifact && e.getKey().filterName() == GATKVCFConstants.ARTIFACT_IN_NORMAL_FILTER_NAME))
.filter(e -> !e.getKey().filterName().equals(filterName())) // exclude the haplotype filter itself, which would be circular
.flatMap(e -> e.getValue().stream()) // the value is a list of double, we need the max of all the lists
.max(Double::compareTo).orElse(0.0);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ public static VCFFormatHeaderLine getEquivalentFormatHeaderLine(final String inf
addInfoLine(new VCFInfoHeaderLine(TREE_SCORE, 1, VCFHeaderLineType.Float, "Score from single sample filtering with random forest model."));

// M2-related info lines
addInfoLine(new VCFInfoHeaderLine(EVENT_COUNT_IN_HAPLOTYPE_KEY, 1, VCFHeaderLineType.Integer, "Number of events in this haplotype"));
addInfoLine(new VCFInfoHeaderLine(EVENT_COUNT_IN_HAPLOTYPE_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Number of somatic events in best supporting haplotype for each alt allele"));
addInfoLine(new VCFInfoHeaderLine(NORMAL_LOG_10_ODDS_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Normal log 10 likelihood ratio of diploid het or hom alt genotypes"));
addInfoLine(new VCFInfoHeaderLine(TUMOR_LOG_10_ODDS_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Log 10 likelihood ratio score of variant existing versus not existing"));
addFormatLine(new VCFFormatHeaderLine(TUMOR_LOG_10_ODDS_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Log 10 likelihood ratio score of variant existing versus not existing"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFUtils;
import org.apache.commons.collections4.SetUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.CommandLineProgramTest;
Expand Down Expand Up @@ -102,14 +103,14 @@ public class Mutect2IntegrationTest extends CommandLineProgramTest {
@DataProvider(name = "dreamSyntheticData")
public Object[][] dreamSyntheticData() {
return new Object[][]{
{DREAM_1_TUMOR, Optional.of(DREAM_1_NORMAL), DREAM_1_TRUTH, DREAM_1_MASK, 0.97, false},
{DREAM_2_TUMOR, Optional.of(DREAM_2_NORMAL), DREAM_2_TRUTH, DREAM_2_MASK, 0.95, false},
{DREAM_2_TUMOR, Optional.empty(), DREAM_2_TRUTH, DREAM_2_MASK, 0.95, false},
{DREAM_2_TUMOR, Optional.empty(), DREAM_2_TRUTH, DREAM_2_MASK, 0.95, true},
{DREAM_3_TUMOR, Optional.of(DREAM_3_NORMAL), DREAM_3_TRUTH, DREAM_3_MASK, 0.90, false},
{DREAM_4_TUMOR, Optional.of(DREAM_4_NORMAL), DREAM_4_TRUTH, DREAM_4_MASK, 0.65, false},
{DREAM_4_TUMOR, Optional.of(DREAM_4_NORMAL), DREAM_4_TRUTH, DREAM_4_MASK, 0.65, true},
{DREAM_4_TUMOR, Optional.empty(), DREAM_4_TRUTH, DREAM_4_MASK, 0.65, false},
{DREAM_1_TUMOR, Optional.of(DREAM_1_NORMAL), DREAM_1_TRUTH, DREAM_1_MASK, 0.98, false},
{DREAM_2_TUMOR, Optional.of(DREAM_2_NORMAL), DREAM_2_TRUTH, DREAM_2_MASK, 0.98, false},
{DREAM_2_TUMOR, Optional.empty(), DREAM_2_TRUTH, DREAM_2_MASK, 0.98, false},
{DREAM_2_TUMOR, Optional.empty(), DREAM_2_TRUTH, DREAM_2_MASK, 0.98, true},
{DREAM_3_TUMOR, Optional.of(DREAM_3_NORMAL), DREAM_3_TRUTH, DREAM_3_MASK, 0.95, false},
{DREAM_4_TUMOR, Optional.of(DREAM_4_NORMAL), DREAM_4_TRUTH, DREAM_4_MASK, 0.8, false},
{DREAM_4_TUMOR, Optional.of(DREAM_4_NORMAL), DREAM_4_TRUTH, DREAM_4_MASK, 0.7, true},
{DREAM_4_TUMOR, Optional.empty(), DREAM_4_TRUTH, DREAM_4_MASK, 0.7, false},
};
}

Expand Down Expand Up @@ -417,6 +418,29 @@ public void testForceCalling() {
}
}

// regression test for PR: https://github.com/broadinstitute/gatk/pull/8717, which fixed an issue wherein germline events were
// incorrectly contributing to the bad haplotype and clustered events filters. In order to make the test more stringent we turn
// on the genotype-germline-sites flag
// The test is on two variants that caused particular trouble previously in the DREAM 2 sample
@Test
public void testFilteredHaplotypeAndClusteredEventsFilters() throws Exception {
Utils.resetRandomGenerator();
final File tumor = DREAM_2_TUMOR;
final File normal = DREAM_2_NORMAL;
final File unfilteredVcf = createTempFile("unfiltered", ".vcf");
final File filteredVcf = createTempFile("filtered", ".vcf");

for (final int locus : List.of(4567628, 20870771)) {
final String interval = "20:" + (locus - 500) + "-" + (locus + 500);
runMutect2(List.of(tumor), List.of(normal), unfilteredVcf, interval, b37Reference, Optional.of(GNOMAD),
args -> args.addFlag(M2ArgumentCollection.GENOTYPE_GERMLINE_SITES_LONG_NAME));
runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference);
final Optional<VariantContext> vcShouldPass = VariantContextTestUtils.streamVcf(filteredVcf).filter(vc -> vc.getStart() == locus).findFirst();
Assert.assertTrue(vcShouldPass.isPresent());
Assert.assertFalse(vcShouldPass.get().isFiltered());
}
}

// test that the dont-use-soft-clips option actually does something
@Test
public void testDontUseSoftClips() {
Expand Down Expand Up @@ -528,7 +552,7 @@ public void testContaminationFilter() {
filteredVariants.get(10).stream().filter(vc -> vc.getFilters().contains(GATKVCFConstants.CONTAMINATION_FILTER_NAME)).count());

final List<VariantContext> missedObviousVariantsAtTenPercent = filteredVariants.get(10).stream()
.filter(vc -> !vc.getFilters().contains(GATKVCFConstants.CONTAMINATION_FILTER_NAME))
.filter(vc -> !vc.isFiltered())
.filter(VariantContext::isBiallelic)
.filter(vc -> {
final int[] AD = vc.getGenotype(0).getAD();
Expand Down Expand Up @@ -1088,7 +1112,18 @@ public void testExposureOfSmithWatermanParametersIsConsistentWithPastResults(fin
};

runCommandLine(args);
IntegrationTestSpec.assertEqualTextFiles(output, expected);

// This used to be an exact text match, which cause hours of aggravation when the test passed locally and
// failed on the cloud.
final double[] outputTlods = VariantContextTestUtils.streamVcf(output)
.flatMap(vc -> vc.getAttributeAsDoubleList(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, 0).stream())
.mapToDouble(x -> x).toArray();

final double[] expectedTlods = VariantContextTestUtils.streamVcf(expected)
.flatMap(vc -> vc.getAttributeAsDoubleList(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, 0).stream())
.mapToDouble(x -> x).toArray();

Assert.assertEquals(outputTlods, expectedTlods);
davidbenjamin marked this conversation as resolved.
Show resolved Hide resolved
}

@SafeVarargs
Expand Down
Loading
Loading