Skip to content

Commit

Permalink
Optimize AbstractBAMFileIndex.query() when querying sequentially (#1397)
Browse files Browse the repository at this point in the history
* Optimize AbstractBAMFileIndex.query to avoid quadratic behavior when reading sequences in order.


Co-authored-by: Louis Bergelson <[email protected]>
  • Loading branch information
lbergelson authored Jul 3, 2019
1 parent 9aa81ed commit edb8c60
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 3 deletions.
15 changes: 13 additions & 2 deletions src/main/java/htsjdk/samtools/AbstractBAMFileIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -398,14 +398,25 @@ protected void readChunks(int nChunks, List<Chunk> chunks) {
}
}

private void skipToSequence(final int sequenceIndex) {
protected void skipToSequence(final int sequenceIndex) {
//Use sequence position cache if available
if(sequenceIndexes[sequenceIndex] != -1){
seek(sequenceIndexes[sequenceIndex]);
return;
}

// Use previous sequence position if in cache, which optimizes for common access pattern
// of iterating through sequences in order.
final int startSequenceIndex;
final int previousSequenceIndex = sequenceIndex - 1;
if (sequenceIndex > 0 && sequenceIndexes[previousSequenceIndex] != -1) {
seek(sequenceIndexes[previousSequenceIndex]);
startSequenceIndex = previousSequenceIndex;
} else {
startSequenceIndex = 0;
}

for (int i = 0; i < sequenceIndex; i++) {
for (int i = startSequenceIndex; i < sequenceIndex; i++) {
// System.out.println("# Sequence TID: " + i);
final int nBins = readInteger();
// System.out.println("# nBins: " + nBins);
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/htsjdk/samtools/CSIIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,8 @@ public BAMIndexContent getQueryResults(final int referenceSequence) {
return query(referenceSequence, 1, -1);
}

private void skipToSequence(final int sequenceIndex) {
@Override
protected void skipToSequence(final int sequenceIndex) {
if(sequenceIndex > getNumberOfReferences()) {
throw new SAMException("Sequence index (" + sequenceIndex + ") is greater than maximum (" + getNumberOfReferences() + ").");
}
Expand Down
30 changes: 30 additions & 0 deletions src/test/java/htsjdk/samtools/CachingBAMFileIndexTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,34 @@ private static void assertCacheStats(CachingBAMFileIndex index, int hits, int mi
Assert.assertEquals(index.getCacheMisses(), misses, "cache misses didn't match expected");
}


/**
* This tests that the optimization for querying forwards in https://github.com/samtools/htsjdk/pull/1396
* returns the same results as the unoptimized queries going backwards.
*/
@Test
public void testSequentialCachingOptimizationDoesntBreakThings() throws IOException {
final BAMIndexContent[] forward = new BAMIndexContent[200];
final BAMIndexContent[] reverse = new BAMIndexContent[200];

try(final CachingBAMFileIndex index = getIndexWith200Contigs()) {
//iterate forward which triggers the optimized path
for (int i = 1; i <= 200; i++) {
final BAMIndexContent query = index.query(i, 1, -1);
forward[i - 1] = query;
}
}

try(final CachingBAMFileIndex index = getIndexWith200Contigs()) {
//iterate backwards which doesn't use the optimized path
for (int i = 200; i > 0; i--) {
final BAMIndexContent query = index.query(i, 1, -1);
reverse[i - 1] = query;
}
}

for( int i = 0; i < 200; i++){
Assert.assertEquals(forward[i], reverse[i]);
}
}
}

0 comments on commit edb8c60

Please sign in to comment.