Skip to content

Commit

Permalink
- case-sensitivity of NM fixed. now tests pass. also refactored code …
Browse files Browse the repository at this point in the history
…so that variables have meaningful names.
  • Loading branch information
yfarjoun authored and Yossi Farjoun committed Oct 24, 2016
1 parent 812d752 commit 17fb671
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 51 deletions.
81 changes: 39 additions & 42 deletions src/main/java/htsjdk/samtools/util/SequenceUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -903,66 +903,63 @@ public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref
final Cigar cigar = record.getCigar();
final List<CigarElement> cigarElements = cigar.getCigarElements();
final byte[] seq = record.getReadBases();
final int start = record.getAlignmentStart() - 1;
int i, x, y, u = 0;
int nm = 0;
final StringBuilder str = new StringBuilder();

final int size = cigarElements.size();
for (i = y = 0, x = start; i < size; ++i) {
final CigarElement ce = cigarElements.get(i);
int j;
final int length = ce.getLength();
final int alignmentStart = record.getAlignmentStart() - 1;
int cigarIndex, blockRefPos, blockReadStart, matchCount = 0;
int nmCount = 0;
final StringBuilder mdString = new StringBuilder();

final int nElements = cigarElements.size();
for (cigarIndex = blockReadStart = 0, blockRefPos = alignmentStart; cigarIndex < nElements; ++cigarIndex) {
final CigarElement ce = cigarElements.get(cigarIndex);
int inBlockOffset;
final int blockLength = ce.getLength();
final CigarOperator op = ce.getOperator();
if (op == CigarOperator.MATCH_OR_MISMATCH || op == CigarOperator.EQ
|| op == CigarOperator.X) {
for (j = 0; j < length; ++j) {
final int z = y + j;
for (inBlockOffset = 0; inBlockOffset < blockLength; ++inBlockOffset) {
final int readOffset = blockReadStart + inBlockOffset;

if (ref.length <= x + j) break; // out of boundary
if (ref.length <= blockRefPos + inBlockOffset) break; // out of boundary

int c1 = 0;
int c2 = 0;
// try {
c1 = seq[z];
c2 = ref[x + j];
final byte readBase = seq[readOffset];
final byte refBase = ref[blockRefPos + inBlockOffset];

if ((c1 == c2) || c1 == 0) {
if ((bases[readBase] == bases[refBase]) || readBase == 0) {
// a match
++u;
++matchCount;
} else {
str.append(u);
str.appendCodePoint(ref[x + j]);
u = 0;
++nm;
mdString.append(matchCount);
mdString.appendCodePoint(refBase);
matchCount = 0;
++nmCount;
}
}
if (j < length) break;
x += length;
y += length;
if (inBlockOffset < blockLength) break;
blockRefPos += blockLength;
blockReadStart += blockLength;
} else if (op == CigarOperator.DELETION) {
str.append(u);
str.append('^');
for (j = 0; j < length; ++j) {
if (ref[x + j] == 0) break;
str.appendCodePoint(ref[x + j]);
mdString.append(matchCount);
mdString.append('^');
for (inBlockOffset = 0; inBlockOffset < blockLength; ++inBlockOffset) {
if (ref[blockRefPos + inBlockOffset] == 0) break;
mdString.appendCodePoint(ref[blockRefPos + inBlockOffset]);
}
u = 0;
if (j < length) break;
x += length;
nm += length;
matchCount = 0;
if (inBlockOffset < blockLength) break;
blockRefPos += blockLength;
nmCount += blockLength;
} else if (op == CigarOperator.INSERTION
|| op == CigarOperator.SOFT_CLIP) {
y += length;
if (op == CigarOperator.INSERTION) nm += length;
blockReadStart += blockLength;
if (op == CigarOperator.INSERTION) nmCount += blockLength;
} else if (op == CigarOperator.SKIPPED_REGION) {
x += length;
blockRefPos += blockLength;
}
}
str.append(u);
mdString.append(matchCount);

if (calcMD) record.setAttribute(SAMTag.MD.name(), str.toString());
if (calcNM) record.setAttribute(SAMTag.NM.name(), nm);
if (calcMD) record.setAttribute(SAMTag.MD.name(), mdString.toString());
if (calcNM) record.setAttribute(SAMTag.NM.name(), nmCount);
}

public static byte upperCase(final byte base) {
Expand Down
9 changes: 5 additions & 4 deletions src/test/java/htsjdk/samtools/util/SequenceUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -441,12 +441,13 @@ public void testCalculateNmTag() {
reader.iterator().stream().forEach(r -> {
Integer nm = SequenceUtil.calculateSamNmTag(r, ref.getSequence(r.getContig()).getBases());
String md = r.getStringAttribute(SAMTag.MD.name());
Assert.assertEquals(r.getIntegerAttribute(SAMTag.NM.name()), nm, "problem with read \'" + r.getReadName() + "\':");
Assert.assertEquals(r.getIntegerAttribute(SAMTag.NM.name()), nm, "problem with NM in read \'" + r.getReadName() + "\':");
SequenceUtil.calculateMdAndNmTags(r, ref.getSequence(r.getContig()).getBases(), true, true);

Assert.assertEquals(r.getIntegerAttribute(SAMTag.NM.name()), nm, "problem with read \'" + r.getReadName() + "\':");
Assert.assertEquals(r.getStringAttribute(SAMTag.MD.name()), md, "problem with read \'" + r.getReadName() + "\':");

Assert.assertEquals(r.getIntegerAttribute(SAMTag.NM.name()), nm, "problem with NM in read \'" + r.getReadName() + "\':");
if (md != null) {
Assert.assertEquals(r.getStringAttribute(SAMTag.MD.name()), md, "problem with MD in read \'" + r.getReadName() + "\':");
}
});
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
@SQ SN:chr1 LN:16 M5:56b74a652b3ed2f610263b8bb423167c UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta
@SQ SN:chr2 LN:16 M5:b835d2c026aa66c52a05838dcc0b59d4 UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta
@CO chr1 value is ACGTACGTacgtacgt
@CO chr1 value is TCGATCGAtcgatcga
@CO chr2 value is TCGATCGAtcgatcga
read1 0 chr1 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:0
read2 0 chr2 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:8 MD:Z:3t0A2T0a2G0t0A2t
read3 0 chr2 1 0 8M * 0 0 TCGATCGA AAAAAAAA NM:i:0
read4 0 chr2 1 0 4M1D2M1S * 0 0 TCGACGAA AAAAAAAA NM:i:1 MD:Z:4^T2

read2 0 chr1 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:0
read3 0 chr2 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:8 MD:Z:0T2A0T2A0t2a0t2a0
read4 0 chr2 1 0 8M * 0 0 TCGATCGA AAAAAAAA NM:i:0
read5 0 chr2 1 0 4M1D2M1S * 0 0 TCGACGAA AAAAAAAA NM:i:1 MD:Z:4^T2

0 comments on commit 17fb671

Please sign in to comment.