Merge pull request #535 from ZLyanov/zl_elc_new_algorithm_proposal
New EstimateLibraryComplexity algorithm using hashes to speed up the comparison of reads that are within the same "group"
Yossi Farjoun authored Nov 19, 2016
2 parents f0a9aaa + 39e6c2a commit f8a93c9
Showing 10 changed files with 3,204 additions and 116 deletions.
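The speed-up rests on a pigeonhole argument: splitting each read into (maximum allowed errors + 1) segments guarantees that any true duplicate agrees with the read on at least one whole segment, so per-segment hashes can bucket candidate duplicates cheaply instead of comparing every pair of reads base by base. A minimal standalone sketch of the idea (not part of the commit; all names are hypothetical):

import java.util.Arrays;

class SegmentHashSketch {
    // Hash each fixed-length segment of a read; segment k covers [k*segLen, (k+1)*segLen).
    static int[] segmentHashes(byte[] read, int segLen) {
        final int n = read.length / segLen;
        final int[] hashes = new int[n];
        for (int k = 0; k < n; k++) {
            hashes[k] = Arrays.hashCode(Arrays.copyOfRange(read, k * segLen, (k + 1) * segLen));
        }
        return hashes;
    }

    public static void main(String[] args) {
        final byte[] a = "ACGTACGTACGT".getBytes();
        final byte[] b = "ACGTACGAACGT".getBytes(); // one mismatch vs. a (position 7)
        // One allowed error, so split into 1 + 1 = 2 segments of length 12 / 2 = 6.
        final int[] ha = segmentHashes(a, 6);
        final int[] hb = segmentHashes(b, 6);
        System.out.println(ha[0] == hb[0]); // true: the error-free segment buckets a and b together
        System.out.println(ha[1] == hb[1]); // false: this segment contains the mismatch
    }
}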
79 changes: 79 additions & 0 deletions src/main/java/picard/sam/markduplicates/ElcDuplicatesFinder.java
@@ -0,0 +1,79 @@
/*
* The MIT License
*
* Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package picard.sam.markduplicates;

import htsjdk.samtools.util.Histogram;
import picard.sam.markduplicates.util.OpticalDuplicateFinder;

import java.util.List;

import static picard.sam.markduplicates.EstimateLibraryComplexity.PairedReadSequence;

/**
* Abstract base class for the duplicate-search algorithms used by EstimateLibraryComplexity.
*
* @author [email protected], EPAM Systems, Inc. <www.epam.com>
*/
abstract class ElcDuplicatesFinder {

protected final int maxReadLength;
protected final double maxDiffRate;
protected int minIdenticalBases;
protected OpticalDuplicateFinder opticalDuplicateFinder;

ElcDuplicatesFinder(double maxDiffRate, int maxReadLength, int minIdenticalBases,
OpticalDuplicateFinder opticalDuplicateFinder) {
this.maxDiffRate = maxDiffRate;
this.minIdenticalBases = minIdenticalBases;
this.opticalDuplicateFinder = opticalDuplicateFinder;
this.maxReadLength = (maxReadLength <= 0) ? Integer.MAX_VALUE : maxReadLength;
}

abstract void searchDuplicates(List<PairedReadSequence> sequences,
Histogram<Integer> duplicationHisto, Histogram<Integer> opticalHisto);

/**
* Fill the duplication and optical-duplicate histograms for one read and its duplicates:
* duplicationHisto is incremented at the duplicate-set size (including the read itself),
* and opticalHisto at the same bin once per optical duplicate found.
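* <p>For example (illustrative): a read with two duplicates adds one count to duplicationHisto
* at bin 3; if one of those three reads is an optical duplicate, opticalHisto also gains a
* count at bin 3. A read with no duplicates adds one count to duplicationHisto at bin 1.</p>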
*/
protected void fillHistogram(Histogram<Integer> duplicationHisto, Histogram<Integer> opticalHisto,
PairedReadSequence prs, List<PairedReadSequence> dupes) {
if (!dupes.isEmpty()) {
dupes.add(prs);
final int duplicateCount = dupes.size();
duplicationHisto.increment(duplicateCount);

final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes, prs);
for (final boolean b : flags) {
if (b) opticalHisto.increment(duplicateCount);
}
} else {
duplicationHisto.increment(1);
}
}

protected int minLength(byte[] read1, byte[] read2) {
return Math.min(Math.min(read1.length, read2.length), maxReadLength);
}
}
80 changes: 80 additions & 0 deletions src/main/java/picard/sam/markduplicates/ElcDuplicatesFinderResolver.java
@@ -0,0 +1,80 @@
/*
* The MIT License
*
* Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package picard.sam.markduplicates;

import htsjdk.samtools.util.Histogram;
import picard.sam.markduplicates.util.OpticalDuplicateFinder;

import java.util.List;

import static picard.sam.markduplicates.EstimateLibraryComplexity.PairedReadSequence;

/**
* Chooses which duplicate-search algorithm to run on the current group of PairedReadSequences.
*
* @author [email protected], EPAM Systems, Inc. <www.epam.com>
*/
class ElcDuplicatesFinderResolver {

/**
* This constant determines the choice of algorithm: groups of BOUNDARY_LIBRARY_SIZE or more reads
* (without barcodes) use the new hash-based algorithm; smaller groups use the identical-bases one
*/
private static final int BOUNDARY_LIBRARY_SIZE = 100;

private boolean useBarcodes;
private ElcHashBasedDuplicatesFinder hashBasedDuplicatesFinder;
private ElcIdenticalBasesDuplicatesFinder identicalBasesDuplicateFinder;

ElcDuplicatesFinderResolver(double maxDiffRate, int maxReadLength, int minIdenticalBases, boolean useBarcodes,
OpticalDuplicateFinder opticalDuplicateFinder) {
this.useBarcodes = useBarcodes;

this.hashBasedDuplicatesFinder = new ElcHashBasedDuplicatesFinder(
maxDiffRate,
maxReadLength,
minIdenticalBases,
opticalDuplicateFinder
);

this.identicalBasesDuplicateFinder = new ElcIdenticalBasesDuplicatesFinder(
maxDiffRate,
maxReadLength,
minIdenticalBases,
useBarcodes,
opticalDuplicateFinder
);
}

void resolveAndSearch(List<PairedReadSequence> sequences, Histogram<Integer> duplicationHisto,
Histogram<Integer> opticalHisto) {
if (useBarcodes || sequences.size() < BOUNDARY_LIBRARY_SIZE) {
identicalBasesDuplicateFinder.searchDuplicates(sequences, duplicationHisto, opticalHisto);
} else {
hashBasedDuplicatesFinder.searchDuplicates(sequences, duplicationHisto, opticalHisto);
}
}

}
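For context, a minimal usage sketch of the resolver (not part of the commit): it assumes caller code in the same picard.sam.markduplicates package, since the classes are package-private; the numeric values mirror EstimateLibraryComplexity's defaults as an assumption, and 'group' stands for a List<PairedReadSequence> whose reads share their first minIdenticalBases bases.

final ElcDuplicatesFinderResolver resolver = new ElcDuplicatesFinderResolver(
        0.03,                           // maxDiffRate (assumed default)
        0,                              // maxReadLength (<= 0 means unlimited)
        5,                              // minIdenticalBases (assumed default)
        false,                          // useBarcodes
        new OpticalDuplicateFinder());  // default read-name regex and pixel distance

final Histogram<Integer> duplicationHisto = new Histogram<>();
final Histogram<Integer> opticalHisto = new Histogram<>();

// Groups of fewer than 100 reads take the identical-bases path;
// larger barcode-free groups take the new hash-based path.
resolver.resolveAndSearch(group, duplicationHisto, opticalHisto);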
216 changes: 216 additions & 0 deletions src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java
@@ -0,0 +1,216 @@
/*
* The MIT License
*
* Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package picard.sam.markduplicates;

import htsjdk.samtools.util.Histogram;
import picard.sam.markduplicates.util.OpticalDuplicateFinder;

import java.util.*;

import static picard.sam.markduplicates.EstimateLibraryComplexity.PairedReadSequence;

/**
* Duplicate-search algorithm used only by EstimateLibraryComplexity. It splits each read into
* (maximum error count + 1) segments and hashes each one, so a true duplicate must share at
* least one segment hash. Reads sharing a hash become candidates, which are then verified by
* comparing the bases of only those segments whose hashes differ.
*
* @author [email protected], EPAM Systems, Inc. <www.epam.com>
*/
class ElcHashBasedDuplicatesFinder extends ElcDuplicatesFinder {

// We split each read into several segments, each of length hashLength.
// hashLength = (min read length) / (number of errors + 1)
private int hashLength = Integer.MAX_VALUE;

ElcHashBasedDuplicatesFinder(double maxDiffRate, int maxReadLength, int minIdenticalBases,
OpticalDuplicateFinder opticalDuplicateFinder) {
super(maxDiffRate, maxReadLength, minIdenticalBases, opticalDuplicateFinder);
}

@Override
void searchDuplicates(List<PairedReadSequence> sequences, Histogram<Integer> duplicationHisto,
Histogram<Integer> opticalHisto) {

initHashLength(sequences);
fillHashValues(sequences);
populateDupCandidates(sequences);

// Set of already-processed reads, used to skip PairedReadSequences handled in an earlier iteration
Set<PairedReadSequence> dupSet = new HashSet<>();
for (PairedReadSequence lhs : sequences) {
if (dupSet.contains(lhs)) continue;

final List<PairedReadSequence> dupes = new ArrayList<>();

for (PairedReadSequence rhs : lhs.dupCandidates) {
if (dupSet.contains(rhs)) continue;

if (isDuplicate(lhs, rhs)) {
dupes.add(rhs);
}
}

dupSet.addAll(dupes);
fillHistogram(duplicationHisto, opticalHisto, lhs, dupes);
}
}

/**
* Populate each PairedReadSequence's dupCandidates with the other reads
* that share at least one segment hash value with it.
*/
private void populateDupCandidates(List<PairedReadSequence> seqs) {
// Maps each hash value to the PairedReadSequences that contain it
Map<Integer, List<PairedReadSequence>> readsByHash = new HashMap<>();

// Iterate over all PairedReadSequences and bucket them by hash value
for (PairedReadSequence prs : seqs) {
int[][] readHashValues = {prs.hashes1, prs.hashes2};
for (int[] readHashValue : readHashValues) {
for (int key : readHashValue) {
readsByHash.computeIfAbsent(key, hash -> new ArrayList<>()).add(prs);
}
}
}

// Record, for each PairedReadSequence, every read it shares a bucket with as a possible duplicate
for (List<PairedReadSequence> dupCandidates : readsByHash.values()) {
for (PairedReadSequence prs : dupCandidates) {
prs.dupCandidates.addAll(dupCandidates);
}
}
}

/**
* Populate hashes for each PairedReadSequence.
*/
private void fillHashValues(List<PairedReadSequence> sequences) {
for (PairedReadSequence prs : sequences) {
prs.initHashes(maxReadLength, hashLength, minIdenticalBases);
}
}

/**
* Calculate the segment hash length as the minimum over all reads of minReadLength / hashNumber
*/
private void initHashLength(List<PairedReadSequence> sequences) {
if (sequences == null || sequences.isEmpty()) {
return;
}
for (PairedReadSequence prs : sequences) {
int minReadLength = Math.min(
Math.min(prs.read1.length, prs.read2.length) - minIdenticalBases,
maxReadLength - minIdenticalBases
);

// hashNumber = (number of allowed errors) + 1, so by the pigeonhole principle a true duplicate
// must match on at least one whole segment; if minReadLength % hashLength != 0, the tail of
// the read is not covered by the hashes and is compared separately
int hashNumber = (int) (minReadLength * maxDiffRate) + 1;
int currentHashLength = minReadLength / hashNumber;
if (hashLength > currentHashLength) {
hashLength = currentHashLength;
}
}
}
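// Worked example (illustrative values, not from the commit): with minIdenticalBases = 5,
// maxDiffRate = 0.03 and 101-base reads, minReadLength = 101 - 5 = 96,
// hashNumber = (int) (96 * 0.03) + 1 = 3, so hashLength = 96 / 3 = 32 and each read
// contributes three 32-base segment hashes.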

private int getReadOffset(int k) {
return k * hashLength + minIdenticalBases;
}

private boolean isDuplicate(final PairedReadSequence lhs, final PairedReadSequence rhs) {
// a read is not a duplicate of itself
if (lhs == rhs) {
return false;
}

final int read1Length = minLength(lhs.read1, rhs.read1);
final int read2Length = minLength(lhs.read2, rhs.read2);
final int maxErrors = (int) Math.floor((read1Length + read2Length) * maxDiffRate);

int errors = compareReadToRead(
lhs.read1, lhs.hashes1,
rhs.read1, rhs.hashes1,
maxErrors
);

if (errors > maxErrors) {
return false;
}

errors += compareReadToRead(
lhs.read2, lhs.hashes2,
rhs.read2, rhs.hashes2,
maxErrors
);

return errors <= maxErrors;
}

/**
* Compare two reads segment by segment: where segment hashes differ, count base mismatches;
* segments with equal hashes are assumed to contribute no errors.
*/
private int compareReadToRead(byte[] read1, int[] hashes1, byte[] read2, int[] hashes2, int maxErrors) {
int errors = 0;
final int minReadLength = minLength(read1, read2);
int minHashesLength = Math.min(hashes1.length, hashes2.length);

for (int k = 0; k < minHashesLength; ++k) {
if (hashes1[k] != hashes2[k]) {
errors += compareHashes(read1, read2, getReadOffset(k), getReadOffset(k + 1));
if (errors > maxErrors) {
return errors;
}
}
}

if (minReadLength > getReadOffset(minHashesLength)) {
errors += compareHashes(read1, read2, getReadOffset(minHashesLength), minReadLength);
}

return errors;
}

/**
* Compare the bases of a single read segment.
*
* @return the number of mismatching bases in [start, stop)
*/
private int compareHashes(byte[] read1, byte[] read2, int start, int stop) {
int errors = 0;
for (int i = start; i < stop; ++i) {
if (read1[i] != read2[i]) {
errors++;
}
}
return errors;
}
}
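The verification step in compareReadToRead reduces to a simplified standalone sketch (not part of the commit; it assumes equal-length reads, no minIdenticalBases offset, and no uncovered tail): bases are compared only in segments whose hashes differ, so segments with equal hashes are assumed error-free, trading a small hash-collision risk for speed.

// Simplified sketch of the verification step; names are hypothetical.
static int countErrors(byte[] r1, int[] h1, byte[] r2, int[] h2, int segLen) {
    int errors = 0;
    final int nSegs = Math.min(h1.length, h2.length);
    for (int k = 0; k < nSegs; k++) {
        if (h1[k] == h2[k]) continue;          // equal hashes: segment assumed identical
        for (int i = k * segLen; i < (k + 1) * segLen; i++) {
            if (r1[i] != r2[i]) errors++;      // count mismatching bases in this segment
        }
    }
    return errors;
}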