Skip to content

Commit

Permalink
Only compute BIN on BAM write and on index building (#1258)
Browse files Browse the repository at this point in the history
  • Loading branch information
tfenne authored Jan 16, 2019
1 parent 94f0967 commit 15ec7da
Show file tree
Hide file tree
Showing 12 changed files with 67 additions and 178 deletions.
6 changes: 1 addition & 5 deletions src/main/java/htsjdk/samtools/BAMIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -231,11 +231,7 @@ public int getEnd() {
}

@Override
public Integer getIndexingBin() {
final Integer binNumber = rec.getIndexingBin();
return (binNumber == null ? rec.computeIndexingBin() : binNumber);

}
public Integer getIndexingBin() { return rec.computeIndexingBin(); }

@Override
public Chunk getChunk() {
Expand Down
3 changes: 0 additions & 3 deletions src/main/java/htsjdk/samtools/BAMRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,6 @@ protected BAMRecord(final SAMFileHeader header,
super.setReadBases(null);
super.setBaseQualities(null);

// Do this after the above because setCigarString will clear it.
setIndexingBin(indexingBin);

// Mark the binary block as being valid for writing back out to disk
mBinaryDataStale = false;
}
Expand Down
48 changes: 31 additions & 17 deletions src/main/java/htsjdk/samtools/BAMRecordCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,19 +107,22 @@ public void encode(final SAMRecord alignment) {
// Compute block size, as it is the first element of the file representation of SAMRecord
final int readLength = alignment.getReadLength();

// if cigar is too long, put into CG tag and replace with sentinel value
if (alignment.getCigarLength() > BAMRecord.MAX_CIGAR_OPERATORS) {
// If cigar is too long, put into CG tag and replace with sentinel value.
// Using alignment.getCigarLength() here causes problems, so access the cigar instead
final Cigar cigarToWrite;
final boolean cigarSwitcharoo = alignment.getCigar().numCigarElements() > BAMRecord.MAX_CIGAR_OPERATORS;

if (cigarSwitcharoo) {
final int[] cigarEncoding = BinaryCigarCodec.encode(alignment.getCigar());
alignment.setCigar(makeSentinelCigar(alignment.getCigar()));
alignment.setAttribute(CG.name(), cigarEncoding);
cigarToWrite = makeSentinelCigar(alignment.getCigar());
}
else {
cigarToWrite = alignment.getCigar();
}

// do not combine with previous call to alignment.getCigarLength() as cigar may change in-between
final int cigarLength = alignment.getCigarLength();

int blockSize = BAMFileConstants.FIXED_BLOCK_SIZE + alignment.getReadNameLength() + 1 + // null terminated
cigarLength * BAMRecord.CIGAR_SIZE_MULTIPLIER +
cigarToWrite.numCigarElements() * BAMRecord.CIGAR_SIZE_MULTIPLIER +
(readLength + 1) / 2 + // 2 bases per byte, round up
readLength;

Expand All @@ -139,8 +142,9 @@ public void encode(final SAMRecord alignment) {
// the actual cigar.
int indexBin = 0;
if (alignment.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) {
warnIfReferenceIsTooLargeForBinField(alignment);
indexBin = alignment.computeIndexingBinIfAbsent(alignment);
if (!warnIfReferenceIsTooLargeForBinField(alignment)) {
indexBin = alignment.computeIndexingBin();
}
}

// Blurt out the elements
Expand All @@ -151,7 +155,7 @@ public void encode(final SAMRecord alignment) {
this.binaryCodec.writeUByte((short) (alignment.getReadNameLength() + 1));
this.binaryCodec.writeUByte((short) alignment.getMappingQuality());
this.binaryCodec.writeUShort(indexBin);
this.binaryCodec.writeUShort(cigarLength);
this.binaryCodec.writeUShort(cigarToWrite.numCigarElements());
this.binaryCodec.writeUShort(alignment.getFlags());
this.binaryCodec.writeInt(alignment.getReadLength());
this.binaryCodec.writeInt(alignment.getMateReferenceIndex());
Expand All @@ -170,7 +174,7 @@ public void encode(final SAMRecord alignment) {
"; quals length: " + alignment.getBaseQualities().length);
}
this.binaryCodec.writeString(alignment.getReadName(), false, true);
final int[] binaryCigar = BinaryCigarCodec.encode(alignment.getCigar());
final int[] binaryCigar = BinaryCigarCodec.encode(cigarToWrite);
for (final int cigarElement : binaryCigar) {
// Assumption that this will fit into an integer, despite the fact
// that it is spec'ed as a uint.
Expand All @@ -194,6 +198,10 @@ public void encode(final SAMRecord alignment) {
attribute = attribute.getNext();
}
}

if (cigarSwitcharoo) {
alignment.setAttribute(CG.name(), null);
}
}

/**
Expand Down Expand Up @@ -223,15 +231,21 @@ public static Cigar makeSentinelCigar(final Cigar cigar) {
new CigarElement(cigar.getReferenceLength(), CigarOperator.N)));
}

private void warnIfReferenceIsTooLargeForBinField(final SAMRecord rec) {
/** Emits a warning the first time a reference too large for binning indexing is encountered.
*
* @param rec the SAMRecord to examine
* @return true if the sequence is too large, false otherwise
*/
private boolean warnIfReferenceIsTooLargeForBinField(final SAMRecord rec) {
final SAMSequenceRecord sequence = rec.getHeader() != null ? rec.getHeader().getSequence(rec.getReferenceName()) : null;
if (!isReferenceSizeWarningShowed
&& sequence != null
&& SAMUtils.isReferenceSequenceCompatibleWithBAI(sequence)
&& rec.getValidationStringency() != ValidationStringency.SILENT) {
LOG.warn("Reference length is too large for BAM bin field. Values in the bin field could be incorrect.");
final boolean tooLarge = sequence != null && SAMUtils.isReferenceSequenceIncompatibleWithBAI(sequence);
if (!isReferenceSizeWarningShowed && tooLarge && rec.getValidationStringency() != ValidationStringency.SILENT) {
LOG.warn("Reference length is too large for BAM bin field.");
LOG.warn("Reads on references longer than " + GenomicIndexUtil.BIN_GENOMIC_SPAN + "bp will have bin set to 0.");
isReferenceSizeWarningShowed = true;
}

return tooLarge;
}

/**
Expand Down
78 changes: 10 additions & 68 deletions src/main/java/htsjdk/samtools/SAMRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,6 @@ public class SAMRecord implements Cloneable, Locatable, Serializable {
private SAMBinaryTagAndValue mAttributes = null;
protected Integer mReferenceIndex = null;
protected Integer mMateReferenceIndex = null;
private Integer mIndexingBin = null;

/**
* Some attributes (e.g. CIGAR) are not decoded immediately. Use this to decide how to validate when decoded.
Expand Down Expand Up @@ -595,8 +594,6 @@ public void setAlignmentStart(final int value) {
mAlignmentStart = value;
// Clear cached alignment end
mAlignmentEnd = NO_ALIGNMENT_START;
// Change to alignmentStart could change indexing bin
setIndexingBin(null);
}

/**
Expand Down Expand Up @@ -806,8 +803,6 @@ public void setCigarString(final String value) {
mAlignmentBlocks = null;
// Clear cached alignment end
mAlignmentEnd = NO_ALIGNMENT_START;
// Change to cigar could change alignmentEnd, and thus indexing bin
setIndexingBin(null);
}

/**
Expand Down Expand Up @@ -844,8 +839,6 @@ public int getCigarLength() {
*/
public void setCigar(final Cigar cigar) {
initializeCigar(cigar);
// Change to cigar could change alignmentEnd, and thus indexing bin
setIndexingBin(null);
}

/**
Expand Down Expand Up @@ -886,8 +879,6 @@ public int getFlags() {

public void setFlags(final int value) {
mFlags = value;
// Could imply change to readUnmapped flag, which could change indexing bin
setIndexingBin(null);
}

/**
Expand Down Expand Up @@ -1043,8 +1034,6 @@ public void setReadUmappedFlag(final boolean flag) {
*/
public void setReadUnmappedFlag(final boolean flag) {
setFlag(flag, SAMFlag.READ_UNMAPPED.flag);
// Change to readUnmapped could change indexing bin
setIndexingBin(null);
}

/**
Expand Down Expand Up @@ -1564,52 +1553,34 @@ public List<SAMTagAndValue> getAttributes() {
return ret;
}

Integer getIndexingBin() {
return mIndexingBin;
}

/**
* Used internally when writing BAMRecords.
* @param mIndexingBin c.f. http://samtools.sourceforge.net/SAM1.pdf
* @deprecated Use computeIndexingBin() if accessible or GenomicIndexUtil.regionToBin() otherwise.
*/
void setIndexingBin(final Integer mIndexingBin) {
this.mIndexingBin = mIndexingBin;
}
@Deprecated() public int computeIndexingBinIfAbsent(final SAMRecord alignment) { return alignment.computeIndexingBin(); }

/**
* Does not change state of this.
* Computes the BAI indexing bin for the record. Invokes getAlignmentEnd() among other methods, which may
* cause the record to deserialize/parse the cigar is necessary.
*
* @return indexing bin based on alignment start & end.
*/
int computeIndexingBin() {
// regionToBin has zero-based, half-open API
final int alignmentStart = getAlignmentStart()-1;
final int alignmentStart = getAlignmentStart() - 1; // BIN uses 0-based half-open
int alignmentEnd = getAlignmentEnd();
if (alignmentEnd <= 0) {
// If alignment end cannot be determined (e.g. because this read is not really aligned),
// then treat this as a one base alignment for indexing purposes.
alignmentEnd = alignmentStart + 1;
}
return GenomicIndexUtil.regionToBin(alignmentStart, alignmentEnd);
}


/**
* Gets indexing bin if it is present otherwise compute it.
* @param alignment to compute indexing bin for.
* @return indexing bin.
*/
public int computeIndexingBinIfAbsent(final SAMRecord alignment) {
if (alignment.getIndexingBin() != null) {
return alignment.getIndexingBin();
} else {
return getUshort(alignment.computeIndexingBin());
if (alignmentStart > GenomicIndexUtil.BIN_GENOMIC_SPAN || alignmentEnd > GenomicIndexUtil.BIN_GENOMIC_SPAN) {
throw new IllegalStateException("Read position too high for BAI bin indexing.");
}
}

private int getUshort(int value) {
return value & (int) BinaryCodec.MAX_USHORT;
return GenomicIndexUtil.regionToBin(alignmentStart, alignmentEnd) & (int) BinaryCodec.MAX_USHORT;
}


/**
* @return the SAMFileHeader for this record. If the header is null, the following SAMRecord methods may throw
* exceptions:
Expand Down Expand Up @@ -1854,8 +1825,6 @@ public boolean equals(final Object o) {
if (mInferredInsertSize != samRecord.mInferredInsertSize) return false;
if (mMappingQuality != samRecord.mMappingQuality) return false;
if (mMateAlignmentStart != samRecord.mMateAlignmentStart) return false;
if (mIndexingBin != null ? !mIndexingBin.equals(samRecord.mIndexingBin) : samRecord.mIndexingBin != null)
return false;
if (mMateReferenceIndex != null ? !mMateReferenceIndex.equals(samRecord.mMateReferenceIndex) : samRecord.mMateReferenceIndex != null)
return false;
if (mReferenceIndex != null ? !mReferenceIndex.equals(samRecord.mReferenceIndex) : samRecord.mReferenceIndex != null)
Expand Down Expand Up @@ -1896,7 +1865,6 @@ public int hashCode() {
result = 31 * result + (mAttributes != null ? mAttributes.hashCode() : 0);
result = 31 * result + (mReferenceIndex != null ? mReferenceIndex.hashCode() : 0);
result = 31 * result + (mMateReferenceIndex != null ? mMateReferenceIndex.hashCode() : 0);
result = 31 * result + (mIndexingBin != null ? mIndexingBin.hashCode() : 0);
return result;
}

Expand Down Expand Up @@ -2112,27 +2080,6 @@ public List<SAMValidationError> isValid(final boolean firstOnly) {
if (firstOnly) return ret;
}

if (this.getAlignmentStart() != NO_ALIGNMENT_START) {
final SAMSequenceRecord sequence = getHeader() != null ? getHeader().getSequence(getReferenceName()) : null;
if (sequence != null && SAMUtils.isReferenceSequenceCompatibleWithBAI(sequence)) {
if (getValidationStringency() != ValidationStringency.SILENT) {
if (getReferenceName() != null) {
LOG.warn(String.format("%s reference length is too large for BAM bin field. %s record bin field value is incorrect.",
getReferenceName(), getReadName()));
} else {
LOG.warn(String.format("Reference length is too large for BAM bin field. %s record bin field value is incorrect.",
getReadName()));
}
}
} else if (isIndexingBinNotEqualsComputedBin()) {
if (ret == null) ret = new ArrayList<>();
ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_INDEXING_BIN,
"bin field of BAM record does not equal value computed based on alignment start and end, and length of sequence to which read is aligned",
getReadName()));
if (firstOnly) return ret;
}
}

if (getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) &&
getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) {
if (ret == null) ret = new ArrayList<>();
Expand Down Expand Up @@ -2165,10 +2112,6 @@ public List<SAMValidationError> isValid(final boolean firstOnly) {
return ret;
}

private boolean isIndexingBinNotEqualsComputedBin() {
return this.getIndexingBin() != null && this.computeIndexingBin() != this.getIndexingBin();
}

/**
* Gets the source of this SAM record -- both the reader that retrieved the record and the position on disk from
* whence it came.
Expand Down Expand Up @@ -2282,7 +2225,6 @@ public SAMRecord deepCopy() {
if (null != attributes) {
newSAM.setAttributes(attributes.deepCopy());
}
newSAM.setIndexingBin(getIndexingBin());

return newSAM;
}
Expand Down
10 changes: 9 additions & 1 deletion src/main/java/htsjdk/samtools/SAMUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -1250,11 +1250,19 @@ public static List<SAMRecord> getOtherCanonicalAlignments(final SAMRecord record
return alignments;
}

/**
* @deprecated because the method does the exact opposite of what it says. Use the correctly named
* isReferenceSequenceIncompatibleWithBAI() instead.
*/
@Deprecated public static boolean isReferenceSequenceCompatibleWithBAI(final SAMSequenceRecord sequence) {
return isReferenceSequenceIncompatibleWithBAI(sequence);
}

/**
* Checks if reference sequence is compatible with BAI indexing format.
* @param sequence reference sequence.
*/
public static boolean isReferenceSequenceCompatibleWithBAI(final SAMSequenceRecord sequence) {
public static boolean isReferenceSequenceIncompatibleWithBAI(final SAMSequenceRecord sequence) {
return sequence.getSequenceLength() > GenomicIndexUtil.BIN_GENOMIC_SPAN;
}
}
6 changes: 5 additions & 1 deletion src/main/java/htsjdk/samtools/SAMValidationError.java
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,11 @@ public enum Type {
DUPLICATE_SAM_TAG,

/** The CG Tag should only be used in BAM format to hold a large cigar */
CG_TAG_FOUND_IN_ATTRIBUTES;
CG_TAG_FOUND_IN_ATTRIBUTES,

/** One or more reference sequences in the dictionary are too long for BAI indexing. */
REF_SEQ_TOO_LONG_FOR_BAI(Severity.WARNING);


public final Severity severity;

Expand Down
9 changes: 9 additions & 0 deletions src/main/java/htsjdk/samtools/SamFileValidator.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

/**
* Validates SAM files as follows:
Expand Down Expand Up @@ -572,6 +573,14 @@ private void validateHeader(final SAMFileHeader fileHeader) {
addError(new SAMValidationError(Type.MISMATCH_FILE_SEQ_DICT, "Mismatch between file and sequence dictionary", null));
}
}

final List<SAMSequenceRecord> longSeqs = fileHeader.getSequenceDictionary().getSequences().stream()
.filter(s -> s.getSequenceLength() > GenomicIndexUtil.BIN_GENOMIC_SPAN).collect(Collectors.toList());

if (!longSeqs.isEmpty()) {
final String msg = "Reference sequences are too long for BAI indexing: " + StringUtil.join(", ", longSeqs);
addError(new SAMValidationError(Type.REF_SEQ_TOO_LONG_FOR_BAI, msg, null));
}
}
if (fileHeader.getReadGroups().isEmpty()) {
addError(new SAMValidationError(Type.MISSING_READ_GROUP, "Read groups is empty", null));
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/htsjdk/samtools/BAMCigarOverflowTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ public void testCigarOverflow() throws Exception {

//The BAM file that exposed the bug triggered a SAM validation error because the bin field of the BAM record did not equal the computed value. Here we test for this error.
//Cast to int to avoid an ambiguity in the assertEquals() call between assertEquals(int,int) and assertEquals(Object,Object).
assertEquals(testBAMRecord.computeIndexingBin(), (int) testBAMRecord.getIndexingBin());
assertEquals(testBAMRecord.computeIndexingBin(), 0);
}
}
5 changes: 1 addition & 4 deletions src/test/java/htsjdk/samtools/BAMFileWriterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,6 @@ private void verifySamReadersEqual(final SamReader reader2, final SamReader read
final SAMRecord samRecord1 = samIt1.next();
final SAMRecord samRecord2 = samIt2.next();

// SAMRecords don't have this set, so stuff it in there
samRecord2.setIndexingBin(samRecord1.getIndexingBin());

// Force reference index attributes to be populated
samRecord1.getReferenceIndex();
samRecord2.getReferenceIndex();
Expand Down Expand Up @@ -342,7 +339,7 @@ public void testBinNotNullWhenLargeCigarIsLoaded(final int numOps) throws Except
try (final SamReader reader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(bamFile)) {
reader.iterator().forEachRemaining(samRecord -> {
samRecord.getCigar();
Assert.assertNotNull(samRecord.getIndexingBin());
samRecord.computeIndexingBin();
});
}
}
Expand Down
Loading

0 comments on commit 15ec7da

Please sign in to comment.