Skip to content

Commit

Permalink
Adding metrics for estimating strand specificity. (#733)
Browse files Browse the repository at this point in the history
* Adding metrics for estimating strand specificity.

Added two metrics that measure the fraction of observed reads that
support R1 being on the strand of transcription versus the opposite
strand of transcription.  For paired end reads, R2 is assumed to be on
the opposite strand of that of R1.

* responding to @tfenne's co
  • Loading branch information
nh13 authored Jan 24, 2017
1 parent 3800c82 commit 1c6cd8a
Show file tree
Hide file tree
Showing 3 changed files with 206 additions and 34 deletions.
30 changes: 30 additions & 0 deletions src/main/java/picard/analysis/RnaSeqMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,36 @@ public class RnaSeqMetrics extends MultilevelMetrics {
/** Number of aligned reads that are mapped to the incorrect strand. 0 if library is not strand-specific. */
public long INCORRECT_STRAND_READS;

/** The number of reads that support the model where R1 is on the strand of transcription and R2 is on the
* opposite strand.
*/
public long NUM_R1_TRANSCRIPT_STRAND_READS;

/**
* The fraction of reads that support the model where R2 is on the strand of transcription and R1 is on the opposite
* strand.
*/
public long NUM_R2_TRANSCRIPT_STRAND_READS;

/**
* The fraction of reads for which the transcription strand model could not be inferred.
*/
public long NUM_UNEXPLAINED_READS;


/** The fraction of reads that support the model where R1 is on the strand of transcription and R2 is on the
* opposite strand. For unpaired reads, it is the fraction of reads that are on the transcription strand (out of all
* the reads).
*/
public double PCT_R1_TRANSCRIPT_STRAND_READS;

/**
* The fraction of reads that support the model where R2 is on the strand of transcription and R1 is on the opposite
* strand. For unpaired reads, it is the fraction of reads that are on opposite strand than that of the the
* transcription strand (out of all the reads).
*/
public double PCT_R2_TRANSCRIPT_STRAND_READS;

/** Fraction of PF_ALIGNED_BASES that mapped to regions encoding ribosomal RNA, RIBOSOMAL_BASES/PF_ALIGNED_BASES */
public Double PCT_RIBOSOMAL_BASES;

Expand Down
85 changes: 63 additions & 22 deletions src/main/java/picard/analysis/directed/RnaSeqMetricsCollector.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
package picard.analysis.directed;

import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.*;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CoordMath;
import htsjdk.samtools.util.Histogram;
Expand Down Expand Up @@ -232,25 +228,64 @@ public void acceptRecord(SAMRecord rec) {

// Strand-specificity is tallied on read basis rather than base at a time. A read that aligns to more than one
// gene is not counted.
if (!rec.getNotPrimaryAlignmentFlag() && overlapsExon && strandSpecificity != StrandSpecificity.NONE && overlappingGenes.size() == 1) {
final boolean negativeTranscriptionStrand = overlappingGenes.iterator().next().isNegativeStrand();
final boolean negativeReadStrand = rec.getReadNegativeStrandFlag();
final boolean readAndTranscriptStrandsAgree = negativeReadStrand == negativeTranscriptionStrand;
final boolean readOneOrUnpaired = !rec.getReadPairedFlag() || rec.getFirstOfPairFlag();
final boolean firstReadExpectedToAgree = strandSpecificity == StrandSpecificity.FIRST_READ_TRANSCRIPTION_STRAND;
final boolean thisReadExpectedToAgree = readOneOrUnpaired == firstReadExpectedToAgree;
// If the read strand is the same as the strand of the transcript, and the end is the one that is supposed to agree,
// then the strand specificity for this read is correct.
// -- OR --
// If the read strand is not the same as the strand of the transcript, and the end is not the one that is supposed
// to agree, then the strand specificity for this read is correct.
if (readAndTranscriptStrandsAgree == thisReadExpectedToAgree) {
++metrics.CORRECT_STRAND_READS;
} else {
++metrics.INCORRECT_STRAND_READS;
if (!rec.getSupplementaryAlignmentFlag() && overlapsExon && overlappingGenes.size() == 1) {
final Gene gene = overlappingGenes.iterator().next();
final boolean negativeTranscriptionStrand = gene.isNegativeStrand();
final boolean readOneOrUnpaired = !rec.getReadPairedFlag() || rec.getFirstOfPairFlag();
final boolean negativeReadStrand = rec.getReadNegativeStrandFlag();
if (strandSpecificity != StrandSpecificity.NONE) {
final boolean readAndTranscriptStrandsAgree = negativeReadStrand == negativeTranscriptionStrand;
final boolean firstReadExpectedToAgree = strandSpecificity == StrandSpecificity.FIRST_READ_TRANSCRIPTION_STRAND;
final boolean thisReadExpectedToAgree = readOneOrUnpaired == firstReadExpectedToAgree;
// If the read strand is the same as the strand of the transcript, and the end is the one that is supposed to agree,
// then the strand specificity for this read is correct.
// -- OR --
// If the read strand is not the same as the strand of the transcript, and the end is not the one that is supposed
// to agree, then the strand specificity for this read is correct.
if (readAndTranscriptStrandsAgree == thisReadExpectedToAgree) {
++metrics.CORRECT_STRAND_READS;
} else {
++metrics.INCORRECT_STRAND_READS;
}
}
}

// Count templates only once rather than individual reads/records
if (readOneOrUnpaired) {
// Check that for paired end reads they are in the proper orientation (FR) and that the entire
// template is enclosed in the gene.
final boolean properOrientation;
final int leftMostAlignedBase, rightMostAlignedBase;
if (rec.getReadPairedFlag()) {
if (rec.getMateUnmappedFlag()) {
properOrientation = false; // mate is unmapped!
leftMostAlignedBase = rightMostAlignedBase = 0;
} else {
// Get the alignment length of the mate. Approximate it with the current read length if no
// mate cigar is found.
final Cigar mateCigar = SAMUtils.getMateCigar(rec);
final int mateReferenceLength = (mateCigar == null) ? rec.getReadLength() : mateCigar.getReferenceLength();
final int mateAlignmentEnd = CoordMath.getEnd(rec.getMateAlignmentStart(), mateReferenceLength);
properOrientation = SamPairUtil.getPairOrientation(rec) == SamPairUtil.PairOrientation.FR;
leftMostAlignedBase = Math.min(rec.getAlignmentStart(), rec.getMateAlignmentStart());
rightMostAlignedBase = Math.max(rec.getAlignmentEnd(), mateAlignmentEnd);
}
} else {
properOrientation = true; // can only be false for paired end reads
leftMostAlignedBase = rec.getAlignmentStart();
rightMostAlignedBase = rec.getAlignmentEnd();
}

if (properOrientation && CoordMath.encloses(gene.getStart(), gene.getEnd(), leftMostAlignedBase, rightMostAlignedBase)) {
if (negativeReadStrand == negativeTranscriptionStrand) {
++metrics.NUM_R1_TRANSCRIPT_STRAND_READS;
} else {
++metrics.NUM_R2_TRANSCRIPT_STRAND_READS;
}
} else {
++metrics.NUM_UNEXPLAINED_READS;
}
}
}
}

protected int getNumAlignedBases(SAMRecord rec) {
Expand All @@ -277,6 +312,12 @@ public void finish() {
if (metrics.CORRECT_STRAND_READS > 0 || metrics.INCORRECT_STRAND_READS > 0) {
metrics.PCT_CORRECT_STRAND_READS = metrics.CORRECT_STRAND_READS/(double)(metrics.CORRECT_STRAND_READS + metrics.INCORRECT_STRAND_READS);
}

final long readsExamined = metrics.NUM_R1_TRANSCRIPT_STRAND_READS + metrics.NUM_R2_TRANSCRIPT_STRAND_READS;
if (0 < readsExamined) {
metrics.PCT_R1_TRANSCRIPT_STRAND_READS = metrics.NUM_R1_TRANSCRIPT_STRAND_READS / (double) readsExamined;
metrics.PCT_R2_TRANSCRIPT_STRAND_READS = metrics.NUM_R2_TRANSCRIPT_STRAND_READS / (double) readsExamined;
}
}

@Override
Expand Down
125 changes: 113 additions & 12 deletions src/test/java/picard/analysis/CollectRnaSeqMetricsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,7 @@
*/
package picard.analysis;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordSetBuilder;
import htsjdk.samtools.*;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
Expand All @@ -48,7 +43,7 @@ public String getCommandLineProgramName() {
}

@Test
public void basic() throws Exception {
public void testBasic() throws Exception {
final String sequence = "chr1";
final String ignoredSequence = "chrM";

Expand Down Expand Up @@ -92,7 +87,7 @@ public void basic() throws Exception {
"REF_FLAT=" + getRefFlatFile(sequence).getAbsolutePath(),
"RIBOSOMAL_INTERVALS=" + rRnaIntervalsFile.getAbsolutePath(),
"STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND",
"IGNORE_SEQUENCE=" +ignoredSequence
"IGNORE_SEQUENCE=" + ignoredSequence
};
Assert.assertEquals(runPicardCommandLine(args), 0);

Expand All @@ -110,6 +105,11 @@ public void basic() throws Exception {
Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3);
Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 4);
Assert.assertEquals(metrics.IGNORED_READS, 1);
Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 1);
Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2);
Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 2);
Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.333333);
Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.666667);
}

@Test
Expand Down Expand Up @@ -165,7 +165,7 @@ public void testMultiLevel() throws Exception {
"REF_FLAT=" + getRefFlatFile(sequence).getAbsolutePath(),
"RIBOSOMAL_INTERVALS=" + rRnaIntervalsFile.getAbsolutePath(),
"STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND",
"IGNORE_SEQUENCE=" +ignoredSequence,
"IGNORE_SEQUENCE=" + ignoredSequence,
"LEVEL=null",
"LEVEL=SAMPLE",
"LEVEL=LIBRARY"
Expand All @@ -187,6 +187,11 @@ public void testMultiLevel() throws Exception {
Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3);
Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 4);
Assert.assertEquals(metrics.IGNORED_READS, 1);
Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 1);
Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2);
Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 2);
Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.333333);
Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.666667);
}
else if (metrics.LIBRARY.equals("foo")) {
Assert.assertEquals(metrics.PF_ALIGNED_BASES, 216);
Expand All @@ -199,7 +204,11 @@ else if (metrics.LIBRARY.equals("foo")) {
Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3);
Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 2);
Assert.assertEquals(metrics.IGNORED_READS, 0);

Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 0);
Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2);
Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 1);
Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.0);
Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 1.0);
}
else if (metrics.LIBRARY.equals("bar")) {
Assert.assertEquals(metrics.PF_ALIGNED_BASES, 180);
Expand All @@ -212,11 +221,104 @@ else if (metrics.LIBRARY.equals("bar")) {
Assert.assertEquals(metrics.CORRECT_STRAND_READS, 0);
Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 2);
Assert.assertEquals(metrics.IGNORED_READS, 1);

Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 1);
Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 0);
Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 1);
Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 1.0);
Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.0);
}
}
}

@Test
public void testTranscriptionStrandMetrics() throws Exception {
final String sequence = "chr1";
final String ignoredSequence = "chrM";

// Create some alignments that hit the ribosomal sequence, various parts of the gene, and intergenic.
final SAMRecordSetBuilder builder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate);
// Set seed so that strandedness is consistent among runs.
builder.setRandomSeed(0);
builder.setReadLength(50);
final int sequenceIndex = builder.getHeader().getSequenceIndex(sequence);

// Reads that start *before* the gene: count as unexamined
builder.addPair("pair_prior_1", sequenceIndex, 45, 50, false, false, "50M", "50M", true, false, -1);
builder.addPair("pair_prior_2", sequenceIndex, 49, 50, false, false, "50M", "50M", true, false, -1);

// Read that is enclosed in the gene, but one end does not map: count as unexamined
builder.addPair("read_one_end_unmapped", sequenceIndex, 50, 51, false, true, "50M", null, false, false, -1);

// Reads that are enclosed in the gene, paired and frag: one count per template
builder.addFrag("frag_enclosed_forward", sequenceIndex, 150, false);
builder.addFrag("frag_enclosed_reverse", sequenceIndex, 150, true);
builder.addPair("pair_enclosed_forward", sequenceIndex, 150, 200, false, false, "50M", "50M", false, true, -1);
builder.addPair("pair_enclosed_reverse", sequenceIndex, 200, 150, false, false, "50M", "50M", true, false, -1);

// Read that starts *after* the gene: not counted
builder.addPair("pair_after_1", sequenceIndex, 545, 550, false, false, "50M", "50M", true, false, -1);
builder.addPair("pair_after_2", sequenceIndex, 549, 550, false, false, "50M", "50M", true, false, -1);

// A read that uses the read length instead of the mate cigar
builder.addFrag("frag_enclosed_forward_no_mate_cigar", sequenceIndex, 150, false).setAttribute(SAMTag.MC.name(), null);

// Duplicate reads are counted
builder.addFrag("frag_duplicate", sequenceIndex, 150, false).setDuplicateReadFlag(true);

// These reads should be ignored.
builder.addFrag("frag_secondary", sequenceIndex, 150, false).setNotPrimaryAlignmentFlag(true);
builder.addFrag("frag_supplementary", sequenceIndex, 150, false).setSupplementaryAlignmentFlag(true);
builder.addFrag("frag_qc_failure", sequenceIndex, 150, false).setReadFailsVendorQualityCheckFlag(true);

final File samFile = File.createTempFile("tmp.collectRnaSeqMetrics.", ".sam");
samFile.deleteOnExit();

final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(builder.getHeader(), false, samFile);
for (final SAMRecord rec: builder.getRecords()) samWriter.addAlignment(rec);
samWriter.close();

// Create an interval list with one ribosomal interval.
final Interval rRnaInterval = new Interval(sequence, 300, 520, true, "rRNA");
final IntervalList rRnaIntervalList = new IntervalList(builder.getHeader());
rRnaIntervalList.add(rRnaInterval);
final File rRnaIntervalsFile = File.createTempFile("tmp.rRna.", ".interval_list");
rRnaIntervalsFile.deleteOnExit();
rRnaIntervalList.write(rRnaIntervalsFile);

// Generate the metrics.
final File metricsFile = File.createTempFile("tmp.", ".rna_metrics");
metricsFile.deleteOnExit();

final String[] args = new String[] {
"INPUT=" + samFile.getAbsolutePath(),
"OUTPUT=" + metricsFile.getAbsolutePath(),
"REF_FLAT=" + getRefFlatFile(sequence).getAbsolutePath(),
"RIBOSOMAL_INTERVALS=" + rRnaIntervalsFile.getAbsolutePath(),
"STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND",
"IGNORE_SEQUENCE=" + ignoredSequence
};
Assert.assertEquals(runPicardCommandLine(args), 0);

final MetricsFile<RnaSeqMetrics, Comparable<?>> output = new MetricsFile<RnaSeqMetrics, Comparable<?>>();
output.read(new FileReader(metricsFile));

final RnaSeqMetrics metrics = output.getMetrics().get(0);
Assert.assertEquals(metrics.PF_ALIGNED_BASES, 900);
Assert.assertEquals(metrics.PF_BASES, 950);
Assert.assertEquals(metrics.RIBOSOMAL_BASES.longValue(), 0L);
Assert.assertEquals(metrics.CODING_BASES, 471);
Assert.assertEquals(metrics.UTR_BASES, 125);
Assert.assertEquals(metrics.INTRONIC_BASES, 98);
Assert.assertEquals(metrics.INTERGENIC_BASES, 206);
Assert.assertEquals(metrics.CORRECT_STRAND_READS, 7);
Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 6);
Assert.assertEquals(metrics.IGNORED_READS, 0);
Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 4);
Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2);
Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 3);
Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.666667);
Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.333333);
}

public File getRefFlatFile(String sequence) throws Exception {
// Create a refFlat file with a single gene containing two exons, one of which is overlapped by the
Expand All @@ -242,5 +344,4 @@ public File getRefFlatFile(String sequence) throws Exception {

return refFlatFile;
}

}

0 comments on commit 1c6cd8a

Please sign in to comment.