Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed a bug in pileup mode relating to number of haplotypes #8489

Merged
merged 10 commits into from
Aug 28, 2023
Original file line number Diff line number Diff line change
Expand Up @@ -470,12 +470,22 @@ static Set<Haplotype> filterPileupHaplotypes(final Set<Haplotype> onlyNewHaploty
.filter(kmer -> kmerReadCounts.getOrDefault(kmer, new MutableInt(0)).intValue() > 0)
.count()));

// if there are ties, we pass any haplotype with a score as good as the numPileupHaplotypes-th best
final long minimumPassingScore = scores.values().stream().sorted(Comparator.reverseOrder()).skip(numPileupHaplotypes - 1).findFirst().get();
return onlyNewHaplotypes.stream().filter(h -> scores.get(h) >= minimumPassingScore).collect(Collectors.toSet());
// Get the minimum passing score from all haplotypes:
final long minimumPassingScore = scores.values().stream()
.sorted(Comparator.reverseOrder())
.skip(numPileupHaplotypes - 1)
.findFirst().get();

// If there are ties, we pass any haplotype with a score as good as the numPileupHaplotypes-th best, with
// final ordering determined by string representation (for determinism).
return onlyNewHaplotypes.stream()
.filter(h -> scores.get(h) >= minimumPassingScore)
.sorted(Comparator.<Haplotype, Long>comparing(scores::get).reversed()
.thenComparing(Haplotype::getBaseString)
).limit(numPileupHaplotypes)
.collect(Collectors.toCollection(LinkedHashSet::new));
}


private static Set<Kmer> kmerizeSequence(byte[] sequence, int kmerSize){
final Set<Kmer> allKmers = new LinkedHashSet<>();
final int stopPosition = sequence.length - kmerSize;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -675,23 +675,27 @@
return; // nothing to do
}

if (debug) {
logger.info("Number of haplotypes pre-pileup injection: " + this.getHaplotypeCount());

Check warning on line 679 in src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java#L679

Added line #L679 was not covered by tests
}

final Haplotype refHaplotype = getReferenceHaplotype();
final Map<Integer, List<Event>> assembledEventByStart = getVariationEvents(argumentCollection.maxMnpDistance).stream()
.collect(Collectors.groupingBy(Event::getStart));
final Collection<Event> assembledIndels = getVariationEvents(argumentCollection.maxMnpDistance).stream().
filter(Event::isIndel).collect(Collectors.toList());
filter(Event::isIndel).toList();

Set<Haplotype> baseHaplotypes = new TreeSet<>();
baseHaplotypes.addAll(getHaplotypeList().stream()
.sorted(Comparator.comparingInt((Haplotype hap) -> hap.isReference() ? 1 : 0).thenComparingDouble(hap -> hap.getScore()).reversed())
.sorted(Comparator.comparingInt((Haplotype hap) -> hap.isReference() ? 1 : 0).thenComparingDouble(Haplotype::getScore).reversed())
.limit(AssemblyBasedCallerUtils.NUM_HAPLOTYPES_TO_INJECT_FORCE_CALLING_ALLELES_INTO)
.collect(Collectors.toList()));
.toList());

//TODO its unclear whether the correct answer here is to use the hardclipped pileup reads (which we used in generating the pileup alleles for specificty reasons)
//TODO or if it would be more accurate to use the softclipped bases here in filtering down the haplotypes. I suspect the latter but I will evaluate later.
Map<Kmer, MutableInt> kmerReadCounts = AssemblyBasedCallerUtils.getKmerReadCounts(region.getHardClippedPileupReads(), argumentCollection.pileupDetectionArgs.filteringKmerSize);

for (final Event event : goodPileupEvents.stream().sorted(Comparator.comparingInt(Event::getStart)).collect(Collectors.toList())) {
for (final Event event : goodPileupEvents.stream().sorted(Comparator.comparingInt(Event::getStart)).toList()) {

if (argumentCollection.pileupDetectionArgs.debugPileupStdout) System.out.println("Processing new Haplotypes for Pileup Allele that was not in the assembly: " + event);

Expand All @@ -705,7 +709,7 @@
continue;
}

final Set<Haplotype> newPileupHaplotypes = new HashSet<>();
final Set<Haplotype> newPileupHaplotypes = new LinkedHashSet<>();
for (final Haplotype baseHaplotype : baseHaplotypes) {
final Haplotype insertedHaplotype = makeHaplotypeWithInsertedEvent(baseHaplotype, refHaplotype, event, aligner, argumentCollection.getHaplotypeToReferenceSWParameters());
if (insertedHaplotype != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1309,28 +1309,37 @@ public Object[][] filterPileupHaplotypesDataProvider() {
put(new Kmer("GAA"), 1);
}};

final List<Haplotype> bigHaplotypeList = Arrays.asList(hapA,hapB,hapB,hapB,hapB,hapB,hapC,hapD,hapF,hapF,hapF,hapF,hapF,hapF);

// NOTE: we limit the number of haplotypes that are returned by the `numPileupHaplotypes` parameter.

Object[][] tests = new Object[][] {
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,5,3,Arrays.asList(hapA,hapB,hapC,hapD)}, //returns all when no filtering required

// These haplotypes are all equivalent, these test stability of the filtering
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,1,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,2,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,3,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,1,3, List.of(hapD) },
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,2,3,Arrays.asList(hapA,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,3,3,Arrays.asList(hapA,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,4,3,Arrays.asList(hapA,hapB,hapC,hapD)},

// Repetitive kmers in hapF don't get double counted
new Object[]{Arrays.asList(hapA,hapB,hapD,hapF),hapFRepeatedKmers,2,3,Arrays.asList(hapF,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapD,hapF),hapFRepeatedKmers,1,3,Arrays.asList(hapF, hapD)}, //currently repeated kmers only count as singular evidence
new Object[]{Arrays.asList(hapA,hapB,hapD,hapF),hapFRepeatedKmers,2,3,Arrays.asList(hapF, hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapD,hapF),hapFRepeatedKmers,1,3, List.of(hapD) }, //currently repeated kmers only count as singular evidence

// These tests demonstrate that the weights in the map don't matter
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,1,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,2,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,3,3,Arrays.asList(hapA,hapB,hapC,hapD)}, // Despite hapD having good support it is not weighted higher
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,1,3, List.of(hapD) },
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,2,3,Arrays.asList(hapA,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,3,3,Arrays.asList(hapA,hapC,hapD)}, // Despite hapD having good support it is not weighted higher

// Test of the output when only one hap has support
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,1,3,Arrays.asList(hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,2,3,Arrays.asList(hapD,hapA, hapC)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,1,3, List.of(hapD) },
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,2,3,Arrays.asList(hapD,hapA)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,3,3,Arrays.asList(hapD,hapA,hapC)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,4,3,Arrays.asList(hapD,hapA,hapC,hapB,hapF)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,4,3,Arrays.asList(hapD,hapA,hapC,hapB)},

// Test of when there are a LOT of haplotypes and we want to limit the number returned:
new Object[]{bigHaplotypeList, hapDKmers, 1, 3, List.of(hapD)},
new Object[]{bigHaplotypeList, hapDKmers, 2, 3, List.of(hapA, hapD)},
};

return tests;
Expand All @@ -1345,7 +1354,8 @@ public void testFilterPileupHaplotypes(final List<Haplotype> inputHaplotypes,
final List<Haplotype> expected) {
final Map<Kmer, MutableInt> counts = kmerReadCounts.entrySet().stream()
.collect(Collectors.toMap(entry -> entry.getKey(), entry -> new MutableInt(entry.getValue())));
Set<Haplotype> actual = AssemblyBasedCallerUtils.filterPileupHaplotypes(new HashSet<>(inputHaplotypes), counts, numPileupHaplotypes, kmerSize);

final Set<Haplotype> actual = AssemblyBasedCallerUtils.filterPileupHaplotypes(new HashSet<>(inputHaplotypes), counts, numPileupHaplotypes, kmerSize);

Assert.assertEquals(actual, new HashSet<>(expected));
}
Expand Down
Loading