-
Notifications
You must be signed in to change notification settings - Fork 594
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Checkpoint of necessary changes for DRAGEN v3.7.8 concordance #8083
Changes from all commits
2cfdfdb
900900c
77d4216
68f6cfa
8d3d123
a690509
1fec7b8
c878ee5
5c2b77a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,8 +31,11 @@ | |
import org.broadinstitute.hellbender.utils.genotyper.SampleList; | ||
import org.broadinstitute.hellbender.utils.haplotype.Haplotype; | ||
import org.broadinstitute.hellbender.utils.haplotype.HaplotypeBAMWriter; | ||
import org.broadinstitute.hellbender.utils.haplotype.PartiallyDeterminedHaplotype; | ||
import org.broadinstitute.hellbender.utils.io.IOUtils; | ||
import org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState; | ||
import org.broadinstitute.hellbender.utils.logging.OneShotLogger; | ||
import org.broadinstitute.hellbender.utils.pileup.PileupBasedAlleles; | ||
import org.broadinstitute.hellbender.utils.pileup.ReadPileup; | ||
import org.broadinstitute.hellbender.utils.read.*; | ||
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; | ||
|
@@ -72,6 +75,8 @@ public final class AssemblyBasedCallerUtils { | |
// get realigned incorrectly. See https://github.com/broadinstitute/gatk/issues/5060 | ||
public static final int MINIMUM_READ_LENGTH_AFTER_TRIMMING = 10; | ||
|
||
public static final OneShotLogger haplotypeDeletionWarningLogger = new OneShotLogger(AssemblyBasedCallerUtils.class); | ||
|
||
// Phase group notation can be interpreted as a representation of the alleles present on the two phased haplotypes at the site: | ||
// "0": REF or '*'; "1": site-specific alt allele | ||
enum PhaseGroup { | ||
|
@@ -263,8 +268,7 @@ public static ReadLikelihoodCalculationEngine createLikelihoodCalculationEngine( | |
final boolean handleSoftclips, | ||
final ReadLikelihoodCalculationEngine.Implementation implementation) { | ||
//AlleleLikelihoods::normalizeLikelihoods uses Double.NEGATIVE_INFINITY as a flag to disable capping | ||
final double log10GlobalReadMismappingRate = likelihoodArgs.phredScaledGlobalReadMismappingRate < 0 ? Double.NEGATIVE_INFINITY | ||
: QualityUtils.qualToErrorProbLog10(likelihoodArgs.phredScaledGlobalReadMismappingRate); | ||
final double log10GlobalReadMismappingRate = getGlobalMismatchingRateFromArgs(likelihoodArgs); | ||
|
||
switch ( implementation) { | ||
// TODO these constructors should eventually be matched so they both incorporate all the same ancilliary arguments | ||
|
@@ -284,6 +288,14 @@ public static ReadLikelihoodCalculationEngine createLikelihoodCalculationEngine( | |
} | ||
} | ||
|
||
/** | ||
* Exposed so that PDHMM can be constructed outside of this class | ||
*/ | ||
public static double getGlobalMismatchingRateFromArgs(LikelihoodEngineArgumentCollection likelihoodArgs) { | ||
return likelihoodArgs.phredScaledGlobalReadMismappingRate < 0 ? Double.NEGATIVE_INFINITY | ||
: QualityUtils.qualToErrorProbLog10(likelihoodArgs.phredScaledGlobalReadMismappingRate); | ||
} | ||
|
||
public static Optional<HaplotypeBAMWriter> createBamWriter(final AssemblyBasedCallerArgumentCollection args, | ||
final boolean createBamOutIndex, | ||
final boolean createBamOutMD5, | ||
|
@@ -320,7 +332,6 @@ public static VariantContext makeMergedVariantContext(final List<VariantContext> | |
* for further HC steps | ||
*/ | ||
public static AssemblyResultSet assembleReads(final AssemblyRegion region, | ||
final List<VariantContext> forcedPileupAlleles, | ||
final AssemblyBasedCallerArgumentCollection argumentCollection, | ||
final SAMFileHeader header, | ||
final SampleList sampleList, | ||
|
@@ -397,10 +408,6 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region, | |
haplotypeCollapsing); | ||
|
||
assemblyResultSet.setHaplotypeCollapsingEngine(haplotypeCollapsing); | ||
|
||
if (!forcedPileupAlleles.isEmpty()) { | ||
processPileupAlleles(region, forcedPileupAlleles, argumentCollection.pileupDetectionArgs.snpAdajacentToAssemblyIndel, argumentCollection.maxMnpDistance, aligner, refHaplotype, assemblyResultSet, argumentCollection.pileupDetectionArgs.numHaplotypesToIterate, argumentCollection.pileupDetectionArgs.filteringKmerSize, argumentCollection.getHaplotypeToReferenceSWParameters()); | ||
} | ||
assemblyResultSet.setDebug(argumentCollection.assemblerArgs.debugAssembly); | ||
assemblyResultSet.debugDump(logger); | ||
return assemblyResultSet; | ||
|
@@ -429,15 +436,76 @@ private static int determineFlowAssemblyColapseHmer(SAMFileHeader readsHeader) { | |
return result; | ||
} | ||
|
||
/** | ||
* Helper method that handles the actual "GGA-like" Merging of haplotype alleles into an assembly result set. | ||
* | ||
* First this method will filter out haplotypes that contain alleles that have failed the pileup calling filtering steps, | ||
* Then the list will attempt to poke into the haplotype list artificial haplotypes that have the found alleles present. | ||
* | ||
* @param region | ||
* @param argumentCollection | ||
* @param aligner | ||
* @param refHaplotype | ||
* @param assemblyResultSet | ||
* @param pileupAllelesFoundShouldFilter | ||
* @param pileupAllelesPassingFilters | ||
* @return | ||
*/ | ||
public static AssemblyResultSet applyPileupEventsAsForcedAlleles(final AssemblyRegion region, final AssemblyBasedCallerArgumentCollection argumentCollection, | ||
final SmithWatermanAligner aligner, final Haplotype refHaplotype, | ||
final AssemblyResultSet assemblyResultSet, final List<VariantContext> pileupAllelesFoundShouldFilter, | ||
final List<VariantContext> pileupAllelesPassingFilters, final boolean debug) { | ||
List<Haplotype> haplotypesWithFilterAlleles = new ArrayList<>(); | ||
// IF we find pileup alleles that we want to filter... AND we are not generating the partially determined haplotypes, | ||
// we resort to a messy approach where we filter alleles by throwing away every haplotype supporting an allele. This is | ||
// very dangerous since this could easily destroy phased variants with the haplotype. | ||
if (!pileupAllelesFoundShouldFilter.isEmpty() && !argumentCollection.pileupDetectionArgs.generatePDHaplotypes) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This logic is messy and dangerous. There should be an explicit opt-in, and probably a user warning printed out, if we are actually throwing away haplotypes here... The non-pdhmm version of this code should really be considered an off-label approach; it really wasn't extensively tested except as a stepping stone to the full PDHmm implementation, so maybe we should re-evaluate whether we want this at all. |
||
// TODO this is a bad algorithm for bad people | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. But is it faster than the Java version of the pdhmm? Does it do effectively the same thing? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's slow, but faster than the PDHMM by a long shot, because currently the Java PDHMM is very slow |
||
for(VariantContext delVariant : pileupAllelesFoundShouldFilter) { | ||
for (Haplotype hap : assemblyResultSet.getHaplotypeList()) { | ||
if (hap.getEventMap()==null) { | ||
// NOTE: This check should be a reasonable sanity check on the inputs. However, there are edge cases in the assembly engine + SW realignment that can cause | ||
// haplotypes to re-merge into the reference and end up with an empty event map. (Also the event map code explicitly expects to fail in some instances) | ||
// thus we can't enforce that the haplotypes have no variants. | ||
// if (!hap.isReference()) { | ||
// //throw new RuntimeException("empty event map for haplotype" + hap); | ||
// } | ||
} else { | ||
if (hap.getEventMap().getVariantContexts().stream().anyMatch(v -> v.getStart() == delVariant.getStart() | ||
&& delVariant.getReference().equals(v.getReference()) | ||
&& delVariant.getAlternateAllele(0).equals(v.getAlternateAllele(0)))) { | ||
if (argumentCollection.pileupDetectionArgs.debugPileupStdout) System.err.println("Flagging hap " + hap + " for containing variant " + delVariant); | ||
haplotypesWithFilterAlleles.add(hap); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
// TODO removing haplotypes whole-cloth is messy and will likely lead to errors and dropped variants in this code | ||
if (!haplotypesWithFilterAlleles.isEmpty()) { | ||
if (debug) System.out.println("Found Assembly Haps with filtered Variants:\n"+haplotypesWithFilterAlleles.stream().map(Haplotype::toString).collect(Collectors.joining("\n"))); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why not use logger.debug here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Logger.debug is polluted with a bunch of other garbage that I don't want to see when I'm debugging these things... I agree this is inelegant. |
||
|
||
haplotypeDeletionWarningLogger.warn(() -> "Haplotypes from Assembly are being filtered by heuristics from the PileupCaller. This can lead to missing variants. See --"+PileupDetectionArgumentCollection.PILEUP_DETECTION_FILTER_ASSEMBLY_HAPS_THRESHOLD+" for more details"); | ||
|
||
for (Haplotype hap : haplotypesWithFilterAlleles) { | ||
assemblyResultSet.removeHapltotype(hap); | ||
} | ||
} | ||
if (!pileupAllelesPassingFilters.isEmpty()) { | ||
processPileupAlleles(region, pileupAllelesPassingFilters, argumentCollection.maxMnpDistance, argumentCollection.pileupDetectionArgs.snpAdajacentToAssemblyIndel, aligner, refHaplotype, assemblyResultSet, argumentCollection.pileupDetectionArgs.numHaplotypesToIterate, argumentCollection.pileupDetectionArgs.filteringKmerSize, argumentCollection.getHaplotypeToReferenceSWParameters(), debug); | ||
} | ||
return assemblyResultSet; | ||
} | ||
|
||
/** | ||
* Handle pileup detected alternate alleles. | ||
*/ | ||
@VisibleForTesting | ||
@SuppressWarnings("deprecation") | ||
static void processPileupAlleles(final AssemblyRegion region, final List<VariantContext> givenAlleles, final int maxMnpDistance, | ||
static void processPileupAlleles(final AssemblyRegion region, final List<VariantContext> pileupVC, final int maxMnpDistance, | ||
final int snpAdjacentToIndelLimit, final SmithWatermanAligner aligner, final Haplotype refHaplotype, | ||
final AssemblyResultSet assemblyResultSet, final int numHaplotypesPerIteration, final int hapFilteringKmerSize, | ||
final SWParameters haplotypeToReferenceSWParameters) { | ||
final SWParameters haplotypeToReferenceSWParameters, final boolean debug) { | ||
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); | ||
final Map<Integer, VariantContext> assembledVariants = assemblyResultSet.getVariationEvents(maxMnpDistance).stream() | ||
.collect(Collectors.groupingBy(VariantContext::getStart, Collectors.collectingAndThen(Collectors.toList(), AssemblyBasedCallerUtils::makeMergedVariantContext))); | ||
|
@@ -455,7 +523,7 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant | |
Map<Kmer, Integer> kmerReadCounts = getKmerReadCounts(region.getHardClippedPileupReads(), hapFilteringKmerSize); | ||
|
||
// Remove SNPs that are too close to assembled indels. | ||
final List<VariantContext> givenAllelesFiltered = givenAlleles.stream().filter(vc -> vc.isIndel() || assembledIndels.stream().noneMatch(indel -> vc.withinDistanceOf(indel, snpAdjacentToIndelLimit))).collect(Collectors.toList()); | ||
final List<VariantContext> givenAllelesFiltered = pileupVC.stream().filter(vc -> vc.isIndel() || assembledIndels.stream().noneMatch(indel -> vc.withinDistanceOf(indel, snpAdjacentToIndelLimit))).collect(Collectors.toList()); | ||
|
||
for (final VariantContext givenVC : givenAllelesFiltered) { | ||
final VariantContext assembledVC = assembledVariants.get(givenVC.getStart()); | ||
|
@@ -470,6 +538,7 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant | |
|
||
final List<Haplotype> newPileupHaplotypes = new ArrayList<>(); | ||
for (final Allele givenAllele : unassembledNonSymbolicAlleles) { | ||
if (debug) System.out.println("Processing new Haplotypes for Pileup Allele that was not in the assembly: "+givenVC); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
for (final Haplotype baseHaplotype : baseHaplotypes) { | ||
// make sure this allele doesn't collide with a variant on the haplotype | ||
if (baseHaplotype.getEventMap() == null || baseHaplotype.getEventMap().getVariantContexts().stream().noneMatch(vc -> vc.overlaps(givenVC))) { | ||
|
@@ -490,7 +559,9 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant | |
} | ||
} | ||
|
||
baseHaplotypes.addAll(filterPileupHaplotypes(newPileupHaplotypes, kmerReadCounts, numHaplotypesPerIteration, hapFilteringKmerSize)); | ||
Set<Haplotype> refactoredHaps = filterPileupHaplotypes(newPileupHaplotypes, kmerReadCounts, numHaplotypesPerIteration, hapFilteringKmerSize); | ||
baseHaplotypes.addAll(refactoredHaps); | ||
if (debug) System.out.println("Constructed the following new Pileup Haplotypes after filtering:\n"+refactoredHaps.stream().map(Haplotype::toString).collect(Collectors.joining("\n"))); | ||
|
||
} | ||
baseHaplotypes.forEach(assemblyResultSet::add); | ||
|
@@ -556,7 +627,8 @@ private static List<Allele> getAllelesNotPresentInAssembly(VariantContext givenV | |
ReferenceConfidenceVariantContextMerger.remapAlleles(assembledVC, longerRef)); | ||
final Set<Allele> givenAlleleSet = new HashSet<>(longerRef.length() == givenVCRefLength ? givenVC.getAlternateAlleles() : | ||
ReferenceConfidenceVariantContextMerger.remapAlleles(givenVC, longerRef)); | ||
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).collect(Collectors.toList()); | ||
//We must filter out the Ref alleles here to protect against edge cases and unexpected behavior from the pileupcalling code | ||
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).filter(a -> !a.isReference()).collect(Collectors.toList()); | ||
jamesemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
return unassembledGivenAlleles; | ||
} | ||
|
@@ -856,6 +928,10 @@ public static Map<Allele, List<Haplotype>> createAlleleMapper(final VariantConte | |
|
||
for (final Haplotype h : haplotypes) { | ||
|
||
// Partially determined haplotypes know at what position they are determined, only determined position haps should be considered for genotyping | ||
if (h.isPartiallyDetermined() && ((PartiallyDeterminedHaplotype) h).getDeterminedPosition() != loc) { | ||
continue; | ||
} | ||
final List<VariantContext> spanningEvents = h.getEventMap().getOverlappingEvents(loc); | ||
|
||
if (spanningEvents.isEmpty()) { //no events --> this haplotype supports the reference at this locus | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this method have a unit test? Or is there any code path covered by integration tests that exercises it?
If it's unused in the pdhmm codepath, can we just remove it? Or do we actually need it for some reason?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's captured by some HaplotypeCallerIntegrationTest consistent-with-past-results tests... It's the old method for pileupcalling that Bhanu added for the bacterial pipeline. I leave it in for legacy reasons, as it's a reasonable alternative approach to the PDHMM-dependent one that is more usable by other users than DRAGEN-GATK.