Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

making changes to reduce size of giant interval lists #1309

Merged
merged 3 commits into from
Mar 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/main/java/htsjdk/samtools/util/IntervalList.java
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,7 @@ public static IntervalList fromReader(final BufferedReader in) {

// Then read in the intervals
final FormatUtil format = new FormatUtil();
String lastSeq = null;
do {
if (line.trim().isEmpty()) {
continue; // skip over blank lines
Expand All @@ -544,7 +545,12 @@ public static IntervalList fromReader(final BufferedReader in) {
}

// Then parse them out
final String seq = fields[SEQUENCE_POS];
String seq = fields[SEQUENCE_POS];
if (seq.equals(lastSeq)) {
seq = lastSeq;
}
lastSeq = seq;

final int start = format.parseInt(fields[START_POS]);
final int end = format.parseInt(fields[END_POS]);
if (start < 1) {
Expand Down
35 changes: 34 additions & 1 deletion src/main/java/htsjdk/samtools/util/IntervalTree.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,13 @@
*/
package htsjdk.samtools.util;

import htsjdk.utils.ValidationUtils;

import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.function.BiFunction;

/**
* A Red-Black tree with intervals for keys.
Expand Down Expand Up @@ -64,7 +68,6 @@ public void clear()
* @param value The associated value.
* @return The old value associated with that interval, or the sentinel.
*/
@SuppressWarnings("null")
public V put( final int start, final int end, final V value )
{
if ( start > end )
Expand Down Expand Up @@ -114,6 +117,36 @@ public V put( final int start, final int end, final V value )
return result;
}

/**
* If the specified start and end positions are not already associated with a value or are
* associated with the sentinel ( see {@link #getSentinel()}, associates it with the given (non-sentinel) value.
* Otherwise, replaces the associated value with the results of the given
* remapping function, or removes if the result is equal to the sentinel value. This
* method may be of use when combining multiple values that have the same start and end position.
*
* @param start interval start position
* @param end interval end position
* @param value value to merge into the tree, must not be equal to the sentinel value
* @param remappingFunction a function that merges the new value with the existing value for the same start and end position,
* if the function returns the sentinel value then the mapping will be unset
* @return the updated value that is stored in the tree after the completion of this merge operation, this will
* be the sentinel value if nothing ended up being stored
*/
public V merge(int start, int end, V value, BiFunction<? super V, ? super V, ? extends V> remappingFunction) {
ValidationUtils.validateArg(!Objects.equals(value, mSentinel), "Values equal to the sentinel value may not be merged");
final V alreadyPresent = put(start, end, value);
if (!Objects.equals(alreadyPresent, mSentinel)) {
final V newComputedValue = remappingFunction.apply(value, alreadyPresent);
if (Objects.equals(newComputedValue, mSentinel)) {
remove(start, end);
} else {
put(start, end, newComputedValue);
}
return newComputedValue;
}
return value;
}

/**
* Remove an interval from the tree. If the interval does not exist in the tree the
* special sentinel value is returned.
Expand Down
25 changes: 17 additions & 8 deletions src/main/java/htsjdk/samtools/util/OverlapDetector.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
package htsjdk.samtools.util;

import java.util.*;
import java.util.function.BiFunction;

/**
* Utility class to efficiently do in memory overlap detection between a large
Expand Down Expand Up @@ -80,19 +81,27 @@ public void addLhs(final T object, final Locatable interval) {
}

final int start = interval.getStart() + this.lhsBuffer;
final int end = interval.getEnd() - this.lhsBuffer;
final int end = interval.getEnd() - this.lhsBuffer;

final Set<T> objects = new HashSet<>(1);
objects.add(object);
if (start <= end) { // Don't put in sequences that have no overlappable bases
final Set<T> alreadyThere = tree.put(start, end, objects);
if (alreadyThere != null) {
alreadyThere.add(object);
tree.put(start, end, alreadyThere);
}
tree.merge(start, end, Collections.singleton(object), mergeSetsAccountingForSingletons());
}
}

/**
* merge two Sets, assumes sets of size 1 are immutale
*/
private static <T> BiFunction<Set<T>, Set<T>, Set<T>> mergeSetsAccountingForSingletons() {
return (newValue, oldValue) -> {
// Sets of size 1 are immutable SingletonSets so we have to make a new
// mutable one to add to
final Set<T> mutableSet = oldValue.size() == 1 ? new HashSet<>() : oldValue;
mutableSet.addAll(oldValue);
mutableSet.addAll(newValue);
return mutableSet;
};
}

/**
* Adds all items to the overlap detector.
*
Expand Down
32 changes: 32 additions & 0 deletions src/test/java/htsjdk/samtools/util/IntervalTreeTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,12 @@
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.UnaryOperator;

import static htsjdk.samtools.util.IntervalTree.Node.HAS_OVERLAPPING_PART;

Expand Down Expand Up @@ -310,4 +315,31 @@ public void testRemove() {
}


@DataProvider
public Object[][] getMergeTestCases(){
return new Object[][]{
{add, null, Arrays.asList(0, 1, 2 , 3), 6},
{add, null, Arrays.asList(0, 1, 2), 3},
{add, null, Arrays.asList(0, 1), 1},
{add, null, Collections.singletonList(0), 0},
{addButCountNullAs10, null, Arrays.asList(1, 2, 3), 6},
{addButCountNullAs10, 0, Arrays.asList(1, 2, null), 13},
{addButCountNullAs10, 0, Arrays.asList(null, null, null), 30}
};
}

private static final BiFunction<Integer, Integer, Integer> add = (a, b) -> a + b;
private static final BiFunction<Integer, Integer, Integer> addButCountNullAs10 = (a, b) -> (a == null ? 10 : a) + (b == null ? 10 : b);

@Test(dataProvider = "getMergeTestCases")
public void testMerge( BiFunction<Integer, Integer, Integer> function, Integer sentinel, List<Integer> values, Integer expected){
final IntervalTree<Integer> tree = new IntervalTree<>();
tree.setSentinel(sentinel);
values.forEach(value -> {
tree.merge(10, 20, value, function);
});
final Iterator<IntervalTree.Node<Integer>> iterator = tree.iterator();
Assert.assertEquals(iterator.next().getValue(), expected);
Assert.assertFalse(iterator.hasNext());
}
}
38 changes: 38 additions & 0 deletions src/test/java/htsjdk/samtools/util/OverlapDetectorTest.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package htsjdk.samtools.util;

import htsjdk.HtsjdkTest;
import htsjdk.tribble.SimpleFeature;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
Expand Down Expand Up @@ -238,4 +239,41 @@ public void testAddTwice() throws Exception {
Assert.assertEquals(overlaps.size(), 1);
Assert.assertEquals(overlaps, Collections.singleton(new Interval("1",10,100)));
}

@Test
public void testMultipleIntervalsSameCoordinates(){
final List<Locatable> input = Arrays.asList(
new IntervalWithNameInEquals("1", 10, 100, false, "same coords 1"),
new IntervalWithNameInEquals("1",10,100, true, "same coords 2"),
new IntervalWithNameInEquals("1", 10, 100, true, "same coords 3"),
new IntervalWithNameInEquals("1", 150, 250, false, "not same coordinates")
);
final OverlapDetector<Locatable> detector = OverlapDetector.create(input);
final Set<Locatable> overlaps = detector.getOverlaps(new Interval("1", 50, 200));
Assert.assertEquals(overlaps.size(), 4);
Assert.assertEquals(overlaps, new HashSet<>(input));
}

/**
* small class to allow creating intervals that have the same coordinates but don't evaluate as equal to each other
*/
private static class IntervalWithNameInEquals extends Interval{

public IntervalWithNameInEquals(String sequence, int start, int end, boolean negative, String name) {
super(sequence, start, end, negative, name);
}

@Override
public boolean equals(Object other){
if (super.equals(other)) {
return getName().equals(((Interval)other).getName());
}
return false;
}

@Override
public int hashCode(){
return super.hashCode() * getName().hashCode();
}
}
}