Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added indel support to SamLocusIterator #408

Merged
merged 5 commits into from
May 29, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -217,12 +217,11 @@ private SAMRecord createReadNoFlag(final String name, final int contig, final in
final int defaultQuality) throws SAMException {
final SAMRecord rec = new SAMRecord(this.header);
rec.setReadName(name);
if (chroms.length <= contig) {
throw new SAMException("Contig too big [" + chroms.length + " < " + contig);
if (header.getSequenceDictionary().size() <= contig) {
throw new SAMException("Contig too big [" + header.getSequenceDictionary().size() + " < " + contig);
}
if (0 <= contig) {
rec.setReferenceIndex(contig);
rec.setReferenceName(chroms[contig]);
rec.setAlignmentStart(start);
}
if (!recordUnmapped) {
Expand Down
229 changes: 178 additions & 51 deletions src/main/java/htsjdk/samtools/util/SamLocusIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,10 @@
*/
package htsjdk.samtools.util;

import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.filter.AggregateFilter;
import htsjdk.samtools.filter.DuplicateReadFilter;
import htsjdk.samtools.filter.FilteringSamIterator;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.filter.SecondaryOrSupplementaryFilter;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import htsjdk.samtools.*;
import htsjdk.samtools.filter.*;

import java.util.*;

/**
* Iterator that traverses a SAM File, accumulating information on a per-locus basis.
Expand Down Expand Up @@ -75,12 +62,16 @@ public RecordAndOffset(final SAMRecord record, final int offset) {

/**
* The unit of iteration. Holds information about the locus (the SAMSequenceRecord and 1-based position
* on the reference), plus List of ReadAndOffset objects, one for each read that overlaps the locus
* on the reference), plus List of ReadAndOffset objects, one for each read that overlaps the locus;
* two more List_s_ of ReadAndOffset objects include reads that overlap the locus with insertions and deletions
* respectively
*/
public static final class LocusInfo implements Locus {
private final SAMSequenceRecord referenceSequence;
private final int position;
private final List<RecordAndOffset> recordAndOffsets = new ArrayList<RecordAndOffset>(100);
private List<RecordAndOffset> deletedInRecord = null;
private List<RecordAndOffset> insertedInRecord = null;

LocusInfo(final SAMSequenceRecord referenceSequence, final int position) {
this.referenceSequence = referenceSequence;
Expand All @@ -92,6 +83,25 @@ public void add(final SAMRecord read, final int position) {
recordAndOffsets.add(new RecordAndOffset(read, position));
}

/** Accumulate info for one read with a deletion */
public void addDeleted(final SAMRecord read, int previousPosition) {
if (deletedInRecord == null) {
deletedInRecord = new ArrayList<>();
}
deletedInRecord.add(new RecordAndOffset(read, previousPosition));
}

/**
* Accumulate info for one read with an insertion.
* For this locus, the reads in the insertion are included also in recordAndOffsets
*/
public void addInserted(final SAMRecord read, int firstPosition) {
if (insertedInRecord == null) {
insertedInRecord = new ArrayList<>();
}
insertedInRecord.add(new RecordAndOffset(read, firstPosition));
}

public int getSequenceIndex() { return referenceSequence.getSequenceIndex(); }

/** @return 1-based reference position */
Expand All @@ -100,6 +110,22 @@ public void add(final SAMRecord read, final int position) {
public String getSequenceName() { return referenceSequence.getSequenceName(); }
@Override public String toString() { return referenceSequence.getSequenceName() + ":" + position; }
public int getSequenceLength() {return referenceSequence.getSequenceLength();}
public List<RecordAndOffset> getDeletedInRecord() {
return (deletedInRecord == null) ? Collections.emptyList() : Collections.unmodifiableList(deletedInRecord);
}
public List<RecordAndOffset> getInsertedInRecord() {
return (insertedInRecord == null) ? Collections.emptyList() : Collections.unmodifiableList(insertedInRecord);
}

/**
* @return <code>true</code> if all the RecordAndOffset lists are empty;
* <code>false</code> if at least one have records
*/
public boolean isEmpty() {
return recordAndOffsets.isEmpty() &&
(deletedInRecord == null || deletedInRecord.isEmpty()) &&
(insertedInRecord == null || insertedInRecord.isEmpty());
}
}


Expand Down Expand Up @@ -155,6 +181,12 @@ public void add(final SAMRecord read, final int position) {
// Set to true when we have enforced the accumulation limit for the first time
private boolean enforcedAccumulationLimit = false;

/**
* If true, include indels in the LocusInfo
*/
private boolean includeIndels = false;


// When there is a target mask, these members remember the last locus for which a LocusInfo has been
// returned, so that any uncovered locus in the target mask can be covered by a 0-coverage LocusInfo
private int lastReferenceSequence = 0;
Expand Down Expand Up @@ -300,8 +332,14 @@ public LocusInfo next() {
continue;
}

final Locus alignmentStart = new LocusImpl(rec.getReferenceIndex(), rec.getAlignmentStart());

int start = rec.getAlignmentStart();
// only if we are including indels and the record does not start in the first base of the reference
// the stop locus to populate the queue is not the same if the record starts with an insertion
if (includeIndels && start != 1 && startWithInsertion(rec.getCigar())) {
// the start to populate is one less
start--;
}
final Locus alignmentStart = new LocusImpl(rec.getReferenceIndex(), start);
// emit everything that is before the start of the current read, because we know no more
// coverage will be accumulated for those loci.
while (!accumulator.isEmpty() && locusComparator.compare(accumulator.get(0), alignmentStart) < 0) {
Expand All @@ -316,17 +354,22 @@ public LocusInfo next() {
}

// at this point, either the accumulator list is empty or the head should
// be the same position as the first base of the read
// be the same position as the first base of the read (or insertion if first)
if (!accumulator.isEmpty()) {
if (accumulator.get(0).getSequenceIndex() != rec.getReferenceIndex() ||
accumulator.get(0).position != rec.getAlignmentStart()) {
accumulator.get(0).position != start) {
throw new IllegalStateException("accumulator should be empty or aligned with current SAMRecord");
}
}

// Store the loci for the read in the accumulator
if (!surpassedAccumulationThreshold()) accumulateSamRecord(rec);

if (!surpassedAccumulationThreshold()) {
accumulateSamRecord(rec);
// Store the indels if requested
if (includeIndels) {
accumulateIndels(rec);
}
}
samIterator.next();
}

Expand Down Expand Up @@ -368,46 +411,89 @@ private boolean surpassedAccumulationThreshold() {
return surpassesThreshold;
}

/**
* Check if cigar start with an insertion, ignoring other operators that do not consume references bases
* @param cigar the cigar
* @return <code>true</code> if the first operator to consume reference bases or be an insertion, is an insertion; <code>false</code> otherwise
*/
private static boolean startWithInsertion(final Cigar cigar) {
for (final CigarElement element : cigar.getCigarElements()) {
if (element.getOperator()==CigarOperator.I) return true;
if (!element.getOperator().consumesReferenceBases()) continue;
break;
}
return false;
}

/**
* Capture the loci covered by the given SAMRecord in the LocusInfos in the accumulator,
* creating new LocusInfos as needed.
*/
private void accumulateSamRecord(final SAMRecord rec) {
final SAMSequenceRecord ref = getReferenceSequence(rec.getReferenceIndex());
final int alignmentStart = rec.getAlignmentStart();
final int alignmentEnd = rec.getAlignmentEnd();
final int alignmentLength = alignmentEnd - alignmentStart;

// Ensure there are LocusInfos up to and including this position
for (int i=accumulator.size(); i<=alignmentLength; ++i) {
accumulator.add(new LocusInfo(ref, alignmentStart + i));
}
// get the accumulator offset
int accOffset = getAccumulatorOffset(rec);

final int minQuality = getQualityScoreCutoff();
final boolean dontCheckQualities = minQuality == 0;
final byte[] baseQualities = dontCheckQualities ? null : rec.getBaseQualities();

// interpret the CIGAR string and add the base info
for (final AlignmentBlock alignmentBlock : rec.getAlignmentBlocks()) {
final int readStart = alignmentBlock.getReadStart();
final int refStart = alignmentBlock.getReferenceStart();
final int readStart = alignmentBlock.getReadStart();
final int refStart = alignmentBlock.getReferenceStart();
final int blockLength = alignmentBlock.getLength();

for (int i=0; i<blockLength; ++i) {
for (int i = 0; i < blockLength; ++i) {
// 0-based offset into the read of the current base
final int readOffset = readStart + i - 1;

// 0-based offset from the aligned position of the first base in the read to the aligned position of the current base.
final int refOffset = refStart + i - alignmentStart;

// if the quality score cutoff is met, accumulate the base info
if (dontCheckQualities || baseQualities[readOffset] >= minQuality) {
// 0-based offset from the aligned position of the first base in the read to the aligned position of the current base.
final int refOffset = refStart + i - accOffset;
accumulator.get(refOffset).add(rec, readOffset);
}
}
}
}

/**
* Requires that the accumulator for the record is previously fill with
* {@link #accumulateSamRecord(htsjdk.samtools.SAMRecord)}.
* Include in the LocusInfo the indels; the quality threshold does not affect insertions/deletions
*/
private void accumulateIndels(final SAMRecord rec) {
// get the cigar elements
final List<CigarElement> cigar = rec.getCigar().getCigarElements();
// 0-based offset into the read of the current base
int readBase = 0;
// 0-based offset for the reference of the current base
// the accumulator could have the previous position because an indel is accumulating
int refBase = rec.getAlignmentStart() - getAccumulatorOffset(rec);
// iterate over the cigar element
for (int elementIndex = 0; elementIndex < cigar.size(); elementIndex++) {
final CigarElement e = cigar.get(elementIndex);
final CigarOperator operator = e.getOperator();
if (operator.equals(CigarOperator.I)) {
System.err.println("");
// insertions are included in the previous base
accumulator.get(refBase - 1).addInserted(rec, readBase);
readBase += e.getLength();
} else if (operator.equals(CigarOperator.D)) {
// accumulate for each position that spans the deletion
for (int i = 0; i < e.getLength(); i++) {
// the offset is the one for the previous base
accumulator.get(refBase + i).addDeleted(rec, readBase - 1);
}
refBase += e.getLength();
} else {
if (operator.consumesReadBases()) readBase += e.getLength();
if (operator.consumesReferenceBases()) refBase += e.getLength();
}
}
}

/**
* Create the next relevant zero-coverage LocusInfo
*
Expand All @@ -434,12 +520,10 @@ private LocusInfo createNextUncoveredLocusInfo(final Locus stopBeforeLocus) {
}
lastReferenceSequence++;
lastPosition = 0;
}
else if (lastReferenceSequence < stopBeforeLocus.getSequenceIndex() || nextbit < stopBeforeLocus.getPosition()) {
} else if (lastReferenceSequence < stopBeforeLocus.getSequenceIndex() || nextbit < stopBeforeLocus.getPosition()) {
lastPosition = nextbit;
return new LocusInfo(getReferenceSequence(lastReferenceSequence), lastPosition);
}
else if (nextbit >= stopBeforeLocus.getPosition()) {
} else if (nextbit >= stopBeforeLocus.getPosition()) {
return null;
}
}
Expand All @@ -456,9 +540,9 @@ else if (nextbit >= stopBeforeLocus.getPosition()) {
*/
private void populateCompleteQueue(final Locus stopBeforeLocus) {
// Because of gapped alignments, it is possible to create LocusInfo's with no reads associated with them.
// Skip over these.
while (!accumulator.isEmpty() && accumulator.get(0).getRecordAndPositions().isEmpty() &&
locusComparator.compare(accumulator.get(0), stopBeforeLocus) < 0) {
// Skip over these if not including indels
while (!accumulator.isEmpty() && accumulator.get(0).isEmpty() &&
locusComparator.compare(accumulator.get(0), stopBeforeLocus) < 0) {
accumulator.remove(0);
}
if (accumulator.isEmpty()) {
Expand Down Expand Up @@ -494,6 +578,28 @@ private void populateCompleteQueue(final Locus stopBeforeLocus) {
lastPosition = locusInfo.getPosition();
}

/**
* Ensure that the queue is populated and get the accumulator offset for the current record
*/
private int getAccumulatorOffset(SAMRecord rec) {
final SAMSequenceRecord ref = getReferenceSequence(rec.getReferenceIndex());
final int alignmentStart = rec.getAlignmentStart();
final int alignmentEnd = rec.getAlignmentEnd();
final int alignmentLength = alignmentEnd - alignmentStart;
// get the offset for an insertion if we are tracking them
final int insOffset = (includeIndels && startWithInsertion(rec.getCigar())) ? 1 : 0;
// if there is an insertion in the first base and it is not tracked in the accumulator, add it
if (insOffset == 1 && accumulator.isEmpty()) {
accumulator.add(new LocusInfo(ref, alignmentStart - 1));
}

// Ensure there are LocusInfos up to and including this position
for (int i = accumulator.size(); i <= alignmentLength + insOffset; ++i) {
accumulator.add(new LocusInfo(ref, alignmentStart + i - insOffset));
}
return alignmentStart - insOffset;
}

private SAMSequenceRecord getReferenceSequence(final int referenceSequenceIndex) {
return samReader.getFileHeader().getSequence(referenceSequenceIndex);
}
Expand All @@ -516,19 +622,29 @@ public void setSamFilters(final List<SamRecordFilter> samFilters) {
this.samFilters = samFilters;
}

public int getQualityScoreCutoff() { return qualityScoreCutoff; }
public int getQualityScoreCutoff() {
return qualityScoreCutoff;
}

public void setQualityScoreCutoff(final int qualityScoreCutoff) { this.qualityScoreCutoff = qualityScoreCutoff; }
public void setQualityScoreCutoff(final int qualityScoreCutoff) {
this.qualityScoreCutoff = qualityScoreCutoff;
}

public int getMappingQualityScoreCutoff() {
return mappingQualityScoreCutoff;
}

public void setMappingQualityScoreCutoff(final int mappingQualityScoreCutoff) { this.mappingQualityScoreCutoff = mappingQualityScoreCutoff; }
public void setMappingQualityScoreCutoff(final int mappingQualityScoreCutoff) {
this.mappingQualityScoreCutoff = mappingQualityScoreCutoff;
}

public boolean isIncludeNonPfReads() { return includeNonPfReads; }
public boolean isIncludeNonPfReads() {
return includeNonPfReads;
}

public void setIncludeNonPfReads(final boolean includeNonPfReads) { this.includeNonPfReads = includeNonPfReads; }
public void setIncludeNonPfReads(final boolean includeNonPfReads) {
this.includeNonPfReads = includeNonPfReads;
}

public boolean isEmitUncoveredLoci() {
return emitUncoveredLoci;
Expand All @@ -547,6 +663,17 @@ public int getMaxReadsToAccumulatePerLocus() {
* As is pointed out above, setting this could cause major bias because of the non-random nature with which the
* cap is applied (the first maxReadsToAccumulatePerLocus reads are kept and all subsequent ones are dropped).
*/
public void setMaxReadsToAccumulatePerLocus(final int maxReadsToAccumulatePerLocus) { this.maxReadsToAccumulatePerLocus = maxReadsToAccumulatePerLocus; }
public void setMaxReadsToAccumulatePerLocus(final int maxReadsToAccumulatePerLocus) {
this.maxReadsToAccumulatePerLocus = maxReadsToAccumulatePerLocus;
}

public boolean isIncludeIndels() {
return includeIndels;
}

public void setIncludeIndels(final boolean includeIndels) {
this.includeIndels = includeIndels;
}

}

Loading