Skip to content

Commit

Permalink
Added indel support to SamLocusIterator (#408)
Browse files Browse the repository at this point in the history
* Added indel tracking to SamLocusIterator (as discussed in issue #387) and some tests. The default behavior is still do not keep reads spanning indels.
* resolved #419 
* several tests included
  • Loading branch information
magicDGS authored and Yossi Farjoun committed May 29, 2016
1 parent 52df499 commit 5196d09
Show file tree
Hide file tree
Showing 3 changed files with 698 additions and 148 deletions.
5 changes: 2 additions & 3 deletions src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -217,12 +217,11 @@ private SAMRecord createReadNoFlag(final String name, final int contig, final in
final int defaultQuality) throws SAMException {
final SAMRecord rec = new SAMRecord(this.header);
rec.setReadName(name);
if (chroms.length <= contig) {
throw new SAMException("Contig too big [" + chroms.length + " < " + contig);
if (header.getSequenceDictionary().size() <= contig) {
throw new SAMException("Contig too big [" + header.getSequenceDictionary().size() + " < " + contig);
}
if (0 <= contig) {
rec.setReferenceIndex(contig);
rec.setReferenceName(chroms[contig]);
rec.setAlignmentStart(start);
}
if (!recordUnmapped) {
Expand Down
229 changes: 178 additions & 51 deletions src/main/java/htsjdk/samtools/util/SamLocusIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,10 @@
*/
package htsjdk.samtools.util;

import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.filter.AggregateFilter;
import htsjdk.samtools.filter.DuplicateReadFilter;
import htsjdk.samtools.filter.FilteringSamIterator;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.filter.SecondaryOrSupplementaryFilter;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import htsjdk.samtools.*;
import htsjdk.samtools.filter.*;

import java.util.*;

/**
* Iterator that traverses a SAM File, accumulating information on a per-locus basis.
Expand Down Expand Up @@ -75,12 +62,16 @@ public RecordAndOffset(final SAMRecord record, final int offset) {

/**
* The unit of iteration. Holds information about the locus (the SAMSequenceRecord and 1-based position
* on the reference), plus List of ReadAndOffset objects, one for each read that overlaps the locus
* on the reference), plus List of ReadAndOffset objects, one for each read that overlaps the locus;
* two more List_s_ of ReadAndOffset objects include reads that overlap the locus with insertions and deletions
* respectively
*/
public static final class LocusInfo implements Locus {
private final SAMSequenceRecord referenceSequence;
private final int position;
private final List<RecordAndOffset> recordAndOffsets = new ArrayList<RecordAndOffset>(100);
private List<RecordAndOffset> deletedInRecord = null;
private List<RecordAndOffset> insertedInRecord = null;

LocusInfo(final SAMSequenceRecord referenceSequence, final int position) {
this.referenceSequence = referenceSequence;
Expand All @@ -92,6 +83,25 @@ public void add(final SAMRecord read, final int position) {
recordAndOffsets.add(new RecordAndOffset(read, position));
}

/** Accumulate info for one read with a deletion */
public void addDeleted(final SAMRecord read, int previousPosition) {
if (deletedInRecord == null) {
deletedInRecord = new ArrayList<>();
}
deletedInRecord.add(new RecordAndOffset(read, previousPosition));
}

/**
* Accumulate info for one read with an insertion.
* For this locus, the reads in the insertion are included also in recordAndOffsets
*/
public void addInserted(final SAMRecord read, int firstPosition) {
if (insertedInRecord == null) {
insertedInRecord = new ArrayList<>();
}
insertedInRecord.add(new RecordAndOffset(read, firstPosition));
}

public int getSequenceIndex() { return referenceSequence.getSequenceIndex(); }

/** @return 1-based reference position */
Expand All @@ -100,6 +110,22 @@ public void add(final SAMRecord read, final int position) {
public String getSequenceName() { return referenceSequence.getSequenceName(); }
@Override public String toString() { return referenceSequence.getSequenceName() + ":" + position; }
public int getSequenceLength() {return referenceSequence.getSequenceLength();}
public List<RecordAndOffset> getDeletedInRecord() {
return (deletedInRecord == null) ? Collections.emptyList() : Collections.unmodifiableList(deletedInRecord);
}
public List<RecordAndOffset> getInsertedInRecord() {
return (insertedInRecord == null) ? Collections.emptyList() : Collections.unmodifiableList(insertedInRecord);
}

/**
* @return <code>true</code> if all the RecordAndOffset lists are empty;
* <code>false</code> if at least one have records
*/
public boolean isEmpty() {
return recordAndOffsets.isEmpty() &&
(deletedInRecord == null || deletedInRecord.isEmpty()) &&
(insertedInRecord == null || insertedInRecord.isEmpty());
}
}


Expand Down Expand Up @@ -155,6 +181,12 @@ public void add(final SAMRecord read, final int position) {
// Set to true when we have enforced the accumulation limit for the first time
private boolean enforcedAccumulationLimit = false;

/**
* If true, include indels in the LocusInfo
*/
private boolean includeIndels = false;


// When there is a target mask, these members remember the last locus for which a LocusInfo has been
// returned, so that any uncovered locus in the target mask can be covered by a 0-coverage LocusInfo
private int lastReferenceSequence = 0;
Expand Down Expand Up @@ -300,8 +332,14 @@ public LocusInfo next() {
continue;
}

final Locus alignmentStart = new LocusImpl(rec.getReferenceIndex(), rec.getAlignmentStart());

int start = rec.getAlignmentStart();
// only if we are including indels and the record does not start in the first base of the reference
// the stop locus to populate the queue is not the same if the record starts with an insertion
if (includeIndels && start != 1 && startWithInsertion(rec.getCigar())) {
// the start to populate is one less
start--;
}
final Locus alignmentStart = new LocusImpl(rec.getReferenceIndex(), start);
// emit everything that is before the start of the current read, because we know no more
// coverage will be accumulated for those loci.
while (!accumulator.isEmpty() && locusComparator.compare(accumulator.get(0), alignmentStart) < 0) {
Expand All @@ -316,17 +354,22 @@ public LocusInfo next() {
}

// at this point, either the accumulator list is empty or the head should
// be the same position as the first base of the read
// be the same position as the first base of the read (or insertion if first)
if (!accumulator.isEmpty()) {
if (accumulator.get(0).getSequenceIndex() != rec.getReferenceIndex() ||
accumulator.get(0).position != rec.getAlignmentStart()) {
accumulator.get(0).position != start) {
throw new IllegalStateException("accumulator should be empty or aligned with current SAMRecord");
}
}

// Store the loci for the read in the accumulator
if (!surpassedAccumulationThreshold()) accumulateSamRecord(rec);

if (!surpassedAccumulationThreshold()) {
accumulateSamRecord(rec);
// Store the indels if requested
if (includeIndels) {
accumulateIndels(rec);
}
}
samIterator.next();
}

Expand Down Expand Up @@ -368,46 +411,89 @@ private boolean surpassedAccumulationThreshold() {
return surpassesThreshold;
}

/**
* Check if cigar start with an insertion, ignoring other operators that do not consume references bases
* @param cigar the cigar
* @return <code>true</code> if the first operator to consume reference bases or be an insertion, is an insertion; <code>false</code> otherwise
*/
private static boolean startWithInsertion(final Cigar cigar) {
for (final CigarElement element : cigar.getCigarElements()) {
if (element.getOperator()==CigarOperator.I) return true;
if (!element.getOperator().consumesReferenceBases()) continue;
break;
}
return false;
}

/**
* Capture the loci covered by the given SAMRecord in the LocusInfos in the accumulator,
* creating new LocusInfos as needed.
*/
private void accumulateSamRecord(final SAMRecord rec) {
final SAMSequenceRecord ref = getReferenceSequence(rec.getReferenceIndex());
final int alignmentStart = rec.getAlignmentStart();
final int alignmentEnd = rec.getAlignmentEnd();
final int alignmentLength = alignmentEnd - alignmentStart;

// Ensure there are LocusInfos up to and including this position
for (int i=accumulator.size(); i<=alignmentLength; ++i) {
accumulator.add(new LocusInfo(ref, alignmentStart + i));
}
// get the accumulator offset
int accOffset = getAccumulatorOffset(rec);

final int minQuality = getQualityScoreCutoff();
final boolean dontCheckQualities = minQuality == 0;
final byte[] baseQualities = dontCheckQualities ? null : rec.getBaseQualities();

// interpret the CIGAR string and add the base info
for (final AlignmentBlock alignmentBlock : rec.getAlignmentBlocks()) {
final int readStart = alignmentBlock.getReadStart();
final int refStart = alignmentBlock.getReferenceStart();
final int readStart = alignmentBlock.getReadStart();
final int refStart = alignmentBlock.getReferenceStart();
final int blockLength = alignmentBlock.getLength();

for (int i=0; i<blockLength; ++i) {
for (int i = 0; i < blockLength; ++i) {
// 0-based offset into the read of the current base
final int readOffset = readStart + i - 1;

// 0-based offset from the aligned position of the first base in the read to the aligned position of the current base.
final int refOffset = refStart + i - alignmentStart;

// if the quality score cutoff is met, accumulate the base info
if (dontCheckQualities || baseQualities[readOffset] >= minQuality) {
// 0-based offset from the aligned position of the first base in the read to the aligned position of the current base.
final int refOffset = refStart + i - accOffset;
accumulator.get(refOffset).add(rec, readOffset);
}
}
}
}

/**
* Requires that the accumulator for the record is previously fill with
* {@link #accumulateSamRecord(htsjdk.samtools.SAMRecord)}.
* Include in the LocusInfo the indels; the quality threshold does not affect insertions/deletions
*/
private void accumulateIndels(final SAMRecord rec) {
// get the cigar elements
final List<CigarElement> cigar = rec.getCigar().getCigarElements();
// 0-based offset into the read of the current base
int readBase = 0;
// 0-based offset for the reference of the current base
// the accumulator could have the previous position because an indel is accumulating
int refBase = rec.getAlignmentStart() - getAccumulatorOffset(rec);
// iterate over the cigar element
for (int elementIndex = 0; elementIndex < cigar.size(); elementIndex++) {
final CigarElement e = cigar.get(elementIndex);
final CigarOperator operator = e.getOperator();
if (operator.equals(CigarOperator.I)) {
System.err.println("");
// insertions are included in the previous base
accumulator.get(refBase - 1).addInserted(rec, readBase);
readBase += e.getLength();
} else if (operator.equals(CigarOperator.D)) {
// accumulate for each position that spans the deletion
for (int i = 0; i < e.getLength(); i++) {
// the offset is the one for the previous base
accumulator.get(refBase + i).addDeleted(rec, readBase - 1);
}
refBase += e.getLength();
} else {
if (operator.consumesReadBases()) readBase += e.getLength();
if (operator.consumesReferenceBases()) refBase += e.getLength();
}
}
}

/**
* Create the next relevant zero-coverage LocusInfo
*
Expand All @@ -434,12 +520,10 @@ private LocusInfo createNextUncoveredLocusInfo(final Locus stopBeforeLocus) {
}
lastReferenceSequence++;
lastPosition = 0;
}
else if (lastReferenceSequence < stopBeforeLocus.getSequenceIndex() || nextbit < stopBeforeLocus.getPosition()) {
} else if (lastReferenceSequence < stopBeforeLocus.getSequenceIndex() || nextbit < stopBeforeLocus.getPosition()) {
lastPosition = nextbit;
return new LocusInfo(getReferenceSequence(lastReferenceSequence), lastPosition);
}
else if (nextbit >= stopBeforeLocus.getPosition()) {
} else if (nextbit >= stopBeforeLocus.getPosition()) {
return null;
}
}
Expand All @@ -456,9 +540,9 @@ else if (nextbit >= stopBeforeLocus.getPosition()) {
*/
private void populateCompleteQueue(final Locus stopBeforeLocus) {
// Because of gapped alignments, it is possible to create LocusInfo's with no reads associated with them.
// Skip over these.
while (!accumulator.isEmpty() && accumulator.get(0).getRecordAndPositions().isEmpty() &&
locusComparator.compare(accumulator.get(0), stopBeforeLocus) < 0) {
// Skip over these if not including indels
while (!accumulator.isEmpty() && accumulator.get(0).isEmpty() &&
locusComparator.compare(accumulator.get(0), stopBeforeLocus) < 0) {
accumulator.remove(0);
}
if (accumulator.isEmpty()) {
Expand Down Expand Up @@ -494,6 +578,28 @@ private void populateCompleteQueue(final Locus stopBeforeLocus) {
lastPosition = locusInfo.getPosition();
}

/**
* Ensure that the queue is populated and get the accumulator offset for the current record
*/
private int getAccumulatorOffset(SAMRecord rec) {
final SAMSequenceRecord ref = getReferenceSequence(rec.getReferenceIndex());
final int alignmentStart = rec.getAlignmentStart();
final int alignmentEnd = rec.getAlignmentEnd();
final int alignmentLength = alignmentEnd - alignmentStart;
// get the offset for an insertion if we are tracking them
final int insOffset = (includeIndels && startWithInsertion(rec.getCigar())) ? 1 : 0;
// if there is an insertion in the first base and it is not tracked in the accumulator, add it
if (insOffset == 1 && accumulator.isEmpty()) {
accumulator.add(new LocusInfo(ref, alignmentStart - 1));
}

// Ensure there are LocusInfos up to and including this position
for (int i = accumulator.size(); i <= alignmentLength + insOffset; ++i) {
accumulator.add(new LocusInfo(ref, alignmentStart + i - insOffset));
}
return alignmentStart - insOffset;
}

private SAMSequenceRecord getReferenceSequence(final int referenceSequenceIndex) {
return samReader.getFileHeader().getSequence(referenceSequenceIndex);
}
Expand All @@ -516,19 +622,29 @@ public void setSamFilters(final List<SamRecordFilter> samFilters) {
this.samFilters = samFilters;
}

public int getQualityScoreCutoff() { return qualityScoreCutoff; }
public int getQualityScoreCutoff() {
return qualityScoreCutoff;
}

public void setQualityScoreCutoff(final int qualityScoreCutoff) { this.qualityScoreCutoff = qualityScoreCutoff; }
public void setQualityScoreCutoff(final int qualityScoreCutoff) {
this.qualityScoreCutoff = qualityScoreCutoff;
}

public int getMappingQualityScoreCutoff() {
return mappingQualityScoreCutoff;
}

public void setMappingQualityScoreCutoff(final int mappingQualityScoreCutoff) { this.mappingQualityScoreCutoff = mappingQualityScoreCutoff; }
public void setMappingQualityScoreCutoff(final int mappingQualityScoreCutoff) {
this.mappingQualityScoreCutoff = mappingQualityScoreCutoff;
}

public boolean isIncludeNonPfReads() { return includeNonPfReads; }
public boolean isIncludeNonPfReads() {
return includeNonPfReads;
}

public void setIncludeNonPfReads(final boolean includeNonPfReads) { this.includeNonPfReads = includeNonPfReads; }
public void setIncludeNonPfReads(final boolean includeNonPfReads) {
this.includeNonPfReads = includeNonPfReads;
}

public boolean isEmitUncoveredLoci() {
return emitUncoveredLoci;
Expand All @@ -547,6 +663,17 @@ public int getMaxReadsToAccumulatePerLocus() {
* As is pointed out above, setting this could cause major bias because of the non-random nature with which the
* cap is applied (the first maxReadsToAccumulatePerLocus reads are kept and all subsequent ones are dropped).
*/
public void setMaxReadsToAccumulatePerLocus(final int maxReadsToAccumulatePerLocus) { this.maxReadsToAccumulatePerLocus = maxReadsToAccumulatePerLocus; }
public void setMaxReadsToAccumulatePerLocus(final int maxReadsToAccumulatePerLocus) {
this.maxReadsToAccumulatePerLocus = maxReadsToAccumulatePerLocus;
}

public boolean isIncludeIndels() {
return includeIndels;
}

public void setIncludeIndels(final boolean includeIndels) {
this.includeIndels = includeIndels;
}

}

Loading

0 comments on commit 5196d09

Please sign in to comment.