From 1c6cd8a251e9fa870472ccd8d883dc0cff8f07fc Mon Sep 17 00:00:00 2001
From: Nils Homer <nh13@users.noreply.github.com>
Date: Tue, 24 Jan 2017 12:43:26 -0700
Subject: [PATCH] Adding metrics for estimating strand specificity. (#733)

* Adding metrics for estimating strand specificity.

Added two metrics that measure the fraction of observed reads that
support R1 being on the strand of transcription versus the opposite
strand of transcription.  For paired end reads, R2 is assumed to be on
the opposite strand of that of R1.

* responding to @tfenne's co
---
 .../java/picard/analysis/RnaSeqMetrics.java   |  30 +++++
 .../directed/RnaSeqMetricsCollector.java      |  85 +++++++++---
 .../analysis/CollectRnaSeqMetricsTest.java    | 125 ++++++++++++++++--
 3 files changed, 206 insertions(+), 34 deletions(-)

diff --git a/src/main/java/picard/analysis/RnaSeqMetrics.java b/src/main/java/picard/analysis/RnaSeqMetrics.java
index 07e299e5e3..c0f7f5376f 100644
--- a/src/main/java/picard/analysis/RnaSeqMetrics.java
+++ b/src/main/java/picard/analysis/RnaSeqMetrics.java
@@ -66,6 +66,36 @@ public class RnaSeqMetrics extends MultilevelMetrics {
     /** Number of aligned reads that are mapped to the incorrect strand.  0 if library is not strand-specific. */
     public long INCORRECT_STRAND_READS;
 
+    /** The number of reads that support the model where R1 is on the strand of transcription and R2 is on the
+     * opposite strand.
+     */
+    public long NUM_R1_TRANSCRIPT_STRAND_READS;
+
+    /**
+     * The fraction of reads that support the model where R2 is on the strand of transcription and R1 is on the opposite
+     * strand.
+     */
+    public long NUM_R2_TRANSCRIPT_STRAND_READS;
+
+    /**
+     * The fraction of reads for which the transcription strand model could not be inferred.
+     */
+    public long NUM_UNEXPLAINED_READS;
+
+
+    /** The fraction of reads that support the model where R1 is on the strand of transcription and R2 is on the
+     * opposite strand.  For unpaired reads, it is the fraction of reads that are on the transcription strand (out of all
+     * the reads).
+     */
+    public double PCT_R1_TRANSCRIPT_STRAND_READS;
+
+    /**
+     * The fraction of reads that support the model where R2 is on the strand of transcription and R1 is on the opposite
+     * strand.  For unpaired reads, it is the fraction of reads that are on opposite strand than that of the the
+     * transcription strand (out of all the reads).
+     */
+    public double PCT_R2_TRANSCRIPT_STRAND_READS;
+
     /** Fraction of PF_ALIGNED_BASES that mapped to regions encoding ribosomal RNA, RIBOSOMAL_BASES/PF_ALIGNED_BASES */
     public Double PCT_RIBOSOMAL_BASES;
 
diff --git a/src/main/java/picard/analysis/directed/RnaSeqMetricsCollector.java b/src/main/java/picard/analysis/directed/RnaSeqMetricsCollector.java
index 149e0ffd99..3ffb4ffe28 100644
--- a/src/main/java/picard/analysis/directed/RnaSeqMetricsCollector.java
+++ b/src/main/java/picard/analysis/directed/RnaSeqMetricsCollector.java
@@ -1,10 +1,6 @@
 package picard.analysis.directed;
 
-import htsjdk.samtools.AlignmentBlock;
-import htsjdk.samtools.SAMFileHeader;
-import htsjdk.samtools.SAMReadGroupRecord;
-import htsjdk.samtools.SAMRecord;
-import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.samtools.*;
 import htsjdk.samtools.metrics.MetricsFile;
 import htsjdk.samtools.util.CoordMath;
 import htsjdk.samtools.util.Histogram;
@@ -232,25 +228,64 @@ public void acceptRecord(SAMRecord rec) {
 
             // Strand-specificity is tallied on read basis rather than base at a time.  A read that aligns to more than one
             // gene is not counted.
-            if (!rec.getNotPrimaryAlignmentFlag() && overlapsExon && strandSpecificity != StrandSpecificity.NONE && overlappingGenes.size() == 1) {
-                final boolean negativeTranscriptionStrand = overlappingGenes.iterator().next().isNegativeStrand();
-                final boolean negativeReadStrand = rec.getReadNegativeStrandFlag();
-                final boolean readAndTranscriptStrandsAgree = negativeReadStrand == negativeTranscriptionStrand;
-                final boolean readOneOrUnpaired = !rec.getReadPairedFlag() || rec.getFirstOfPairFlag();
-                final boolean firstReadExpectedToAgree = strandSpecificity == StrandSpecificity.FIRST_READ_TRANSCRIPTION_STRAND;
-                final boolean thisReadExpectedToAgree = readOneOrUnpaired == firstReadExpectedToAgree;
-                // If the read strand is the same as the strand of the transcript, and the end is the one that is supposed to agree,
-                // then the strand specificity for this read is correct.
-                // -- OR --
-                // If the read strand is not the same as the strand of the transcript, and the end is not the one that is supposed
-                // to agree, then the strand specificity for this read is correct.
-                if (readAndTranscriptStrandsAgree == thisReadExpectedToAgree) {
-                    ++metrics.CORRECT_STRAND_READS;
-                } else {
-                    ++metrics.INCORRECT_STRAND_READS;
+            if (!rec.getSupplementaryAlignmentFlag() && overlapsExon && overlappingGenes.size() == 1) {
+                final Gene gene                           = overlappingGenes.iterator().next();
+                final boolean negativeTranscriptionStrand = gene.isNegativeStrand();
+                final boolean readOneOrUnpaired           = !rec.getReadPairedFlag() || rec.getFirstOfPairFlag();
+                final boolean negativeReadStrand          = rec.getReadNegativeStrandFlag();
+                if (strandSpecificity != StrandSpecificity.NONE) {
+                    final boolean readAndTranscriptStrandsAgree = negativeReadStrand == negativeTranscriptionStrand;
+                    final boolean firstReadExpectedToAgree      = strandSpecificity == StrandSpecificity.FIRST_READ_TRANSCRIPTION_STRAND;
+                    final boolean thisReadExpectedToAgree       = readOneOrUnpaired == firstReadExpectedToAgree;
+                    // If the read strand is the same as the strand of the transcript, and the end is the one that is supposed to agree,
+                    // then the strand specificity for this read is correct.
+                    // -- OR --
+                    // If the read strand is not the same as the strand of the transcript, and the end is not the one that is supposed
+                    // to agree, then the strand specificity for this read is correct.
+                    if (readAndTranscriptStrandsAgree == thisReadExpectedToAgree) {
+                        ++metrics.CORRECT_STRAND_READS;
+                    } else {
+                        ++metrics.INCORRECT_STRAND_READS;
+                    }
                 }
-            }
 
+                // Count templates only once rather than individual reads/records
+                if (readOneOrUnpaired) {
+                    // Check that for paired end reads they are in the proper orientation (FR) and that the entire
+                    // template is enclosed in the gene.
+                    final boolean properOrientation;
+                    final int leftMostAlignedBase, rightMostAlignedBase;
+                    if (rec.getReadPairedFlag()) {
+                        if (rec.getMateUnmappedFlag()) {
+                            properOrientation   = false; // mate is unmapped!
+                            leftMostAlignedBase = rightMostAlignedBase = 0;
+                        } else {
+                            // Get the alignment length of the mate.  Approximate it with the current read length if no
+                            // mate cigar is found.
+                            final Cigar mateCigar         = SAMUtils.getMateCigar(rec);
+                            final int mateReferenceLength = (mateCigar == null) ? rec.getReadLength() : mateCigar.getReferenceLength();
+                            final int mateAlignmentEnd    =  CoordMath.getEnd(rec.getMateAlignmentStart(), mateReferenceLength);
+                            properOrientation    = SamPairUtil.getPairOrientation(rec) == SamPairUtil.PairOrientation.FR;
+                            leftMostAlignedBase  = Math.min(rec.getAlignmentStart(), rec.getMateAlignmentStart());
+                            rightMostAlignedBase = Math.max(rec.getAlignmentEnd(), mateAlignmentEnd);
+                        }
+                    } else {
+                        properOrientation    = true; // can only be false for paired end reads
+                        leftMostAlignedBase  = rec.getAlignmentStart();
+                        rightMostAlignedBase = rec.getAlignmentEnd();
+                    }
+
+                    if (properOrientation && CoordMath.encloses(gene.getStart(), gene.getEnd(), leftMostAlignedBase, rightMostAlignedBase)) {
+                        if (negativeReadStrand == negativeTranscriptionStrand) {
+                            ++metrics.NUM_R1_TRANSCRIPT_STRAND_READS;
+                        } else {
+                            ++metrics.NUM_R2_TRANSCRIPT_STRAND_READS;
+                        }
+                    } else {
+                        ++metrics.NUM_UNEXPLAINED_READS;
+                    }
+                }
+            }
         }
 
         protected int getNumAlignedBases(SAMRecord rec) {
@@ -277,6 +312,12 @@ public void finish() {
             if (metrics.CORRECT_STRAND_READS > 0 || metrics.INCORRECT_STRAND_READS > 0) {
                 metrics.PCT_CORRECT_STRAND_READS = metrics.CORRECT_STRAND_READS/(double)(metrics.CORRECT_STRAND_READS + metrics.INCORRECT_STRAND_READS);
             }
+
+            final long readsExamined = metrics.NUM_R1_TRANSCRIPT_STRAND_READS + metrics.NUM_R2_TRANSCRIPT_STRAND_READS;
+            if (0 < readsExamined) {
+                metrics.PCT_R1_TRANSCRIPT_STRAND_READS = metrics.NUM_R1_TRANSCRIPT_STRAND_READS / (double) readsExamined;
+                metrics.PCT_R2_TRANSCRIPT_STRAND_READS = metrics.NUM_R2_TRANSCRIPT_STRAND_READS / (double) readsExamined;
+            }
         }
 
         @Override
diff --git a/src/test/java/picard/analysis/CollectRnaSeqMetricsTest.java b/src/test/java/picard/analysis/CollectRnaSeqMetricsTest.java
index d587791664..fd25a8503d 100644
--- a/src/test/java/picard/analysis/CollectRnaSeqMetricsTest.java
+++ b/src/test/java/picard/analysis/CollectRnaSeqMetricsTest.java
@@ -23,12 +23,7 @@
  */
 package picard.analysis;
 
-import htsjdk.samtools.SAMFileHeader;
-import htsjdk.samtools.SAMFileWriter;
-import htsjdk.samtools.SAMFileWriterFactory;
-import htsjdk.samtools.SAMReadGroupRecord;
-import htsjdk.samtools.SAMRecord;
-import htsjdk.samtools.SAMRecordSetBuilder;
+import htsjdk.samtools.*;
 import htsjdk.samtools.metrics.MetricsFile;
 import htsjdk.samtools.util.Interval;
 import htsjdk.samtools.util.IntervalList;
@@ -48,7 +43,7 @@ public String getCommandLineProgramName() {
     }
 
     @Test
-    public void basic() throws Exception {
+    public void testBasic() throws Exception {
         final String sequence = "chr1";
         final String ignoredSequence = "chrM";
 
@@ -92,7 +87,7 @@ public void basic() throws Exception {
                 "REF_FLAT=" +            getRefFlatFile(sequence).getAbsolutePath(),
                 "RIBOSOMAL_INTERVALS=" + rRnaIntervalsFile.getAbsolutePath(),
                 "STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND",
-                "IGNORE_SEQUENCE=" +ignoredSequence
+                "IGNORE_SEQUENCE=" + ignoredSequence
         };
         Assert.assertEquals(runPicardCommandLine(args), 0);
 
@@ -110,6 +105,11 @@ public void basic() throws Exception {
         Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3);
         Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 4);
         Assert.assertEquals(metrics.IGNORED_READS, 1);
+        Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 1);
+        Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2);
+        Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 2);
+        Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.333333);
+        Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.666667);
     }
 
     @Test
@@ -165,7 +165,7 @@ public void testMultiLevel() throws Exception {
                 "REF_FLAT=" +            getRefFlatFile(sequence).getAbsolutePath(),
                 "RIBOSOMAL_INTERVALS=" + rRnaIntervalsFile.getAbsolutePath(),
                 "STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND",
-                "IGNORE_SEQUENCE=" +ignoredSequence,
+                "IGNORE_SEQUENCE=" + ignoredSequence,
                 "LEVEL=null",
                 "LEVEL=SAMPLE",
                 "LEVEL=LIBRARY"
@@ -187,6 +187,11 @@ public void testMultiLevel() throws Exception {
                 Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3);
                 Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 4);
                 Assert.assertEquals(metrics.IGNORED_READS, 1);
+                Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 1);
+                Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2);
+                Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 2);
+                Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.333333);
+                Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.666667);
             }
             else if (metrics.LIBRARY.equals("foo")) {
                 Assert.assertEquals(metrics.PF_ALIGNED_BASES, 216);
@@ -199,7 +204,11 @@ else if (metrics.LIBRARY.equals("foo")) {
                 Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3);
                 Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 2);
                 Assert.assertEquals(metrics.IGNORED_READS, 0);
-
+                Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 0);
+                Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2);
+                Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 1);
+                Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.0);
+                Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 1.0);
             }
             else if (metrics.LIBRARY.equals("bar")) {
                 Assert.assertEquals(metrics.PF_ALIGNED_BASES, 180);
@@ -212,11 +221,104 @@ else if (metrics.LIBRARY.equals("bar")) {
                 Assert.assertEquals(metrics.CORRECT_STRAND_READS, 0);
                 Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 2);
                 Assert.assertEquals(metrics.IGNORED_READS, 1);
-
+                Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 1);
+                Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 0);
+                Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 1);
+                Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 1.0);
+                Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.0);
             }
         }
     }
 
+    @Test
+    public void testTranscriptionStrandMetrics() throws Exception {
+        final String sequence = "chr1";
+        final String ignoredSequence = "chrM";
+
+        // Create some alignments that hit the ribosomal sequence, various parts of the gene, and intergenic.
+        final SAMRecordSetBuilder builder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate);
+        // Set seed so that strandedness is consistent among runs.
+        builder.setRandomSeed(0);
+        builder.setReadLength(50);
+        final int sequenceIndex = builder.getHeader().getSequenceIndex(sequence);
+
+        // Reads that start *before* the gene: count as unexamined
+        builder.addPair("pair_prior_1", sequenceIndex, 45, 50, false, false, "50M", "50M", true, false, -1);
+        builder.addPair("pair_prior_2", sequenceIndex, 49, 50, false, false, "50M", "50M", true, false, -1);
+
+        // Read that is enclosed in the gene, but one end does not map: count as unexamined
+        builder.addPair("read_one_end_unmapped", sequenceIndex, 50, 51, false, true, "50M", null, false, false, -1);
+
+        // Reads that are enclosed in the gene, paired and frag: one count per template
+        builder.addFrag("frag_enclosed_forward", sequenceIndex, 150, false);
+        builder.addFrag("frag_enclosed_reverse", sequenceIndex, 150, true);
+        builder.addPair("pair_enclosed_forward", sequenceIndex, 150, 200, false, false, "50M", "50M", false, true, -1);
+        builder.addPair("pair_enclosed_reverse", sequenceIndex, 200, 150, false, false, "50M", "50M", true, false, -1);
+
+        // Read that starts *after* the gene: not counted
+        builder.addPair("pair_after_1", sequenceIndex, 545, 550, false, false, "50M", "50M", true, false, -1);
+        builder.addPair("pair_after_2", sequenceIndex, 549, 550, false, false, "50M", "50M", true, false, -1);
+
+        // A read that uses the read length instead of the mate cigar
+        builder.addFrag("frag_enclosed_forward_no_mate_cigar", sequenceIndex, 150, false).setAttribute(SAMTag.MC.name(), null);
+
+        // Duplicate reads are counted
+        builder.addFrag("frag_duplicate", sequenceIndex, 150, false).setDuplicateReadFlag(true);
+
+        // These reads should be ignored.
+        builder.addFrag("frag_secondary", sequenceIndex, 150, false).setNotPrimaryAlignmentFlag(true);
+        builder.addFrag("frag_supplementary", sequenceIndex, 150, false).setSupplementaryAlignmentFlag(true);
+        builder.addFrag("frag_qc_failure", sequenceIndex, 150, false).setReadFailsVendorQualityCheckFlag(true);
+
+        final File samFile = File.createTempFile("tmp.collectRnaSeqMetrics.", ".sam");
+        samFile.deleteOnExit();
+
+        final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(builder.getHeader(), false, samFile);
+        for (final SAMRecord rec: builder.getRecords()) samWriter.addAlignment(rec);
+        samWriter.close();
+
+        // Create an interval list with one ribosomal interval.
+        final Interval rRnaInterval = new Interval(sequence, 300, 520, true, "rRNA");
+        final IntervalList rRnaIntervalList = new IntervalList(builder.getHeader());
+        rRnaIntervalList.add(rRnaInterval);
+        final File rRnaIntervalsFile = File.createTempFile("tmp.rRna.", ".interval_list");
+        rRnaIntervalsFile.deleteOnExit();
+        rRnaIntervalList.write(rRnaIntervalsFile);
+
+        // Generate the metrics.
+        final File metricsFile = File.createTempFile("tmp.", ".rna_metrics");
+        metricsFile.deleteOnExit();
+
+        final String[] args = new String[] {
+                "INPUT=" +               samFile.getAbsolutePath(),
+                "OUTPUT=" +              metricsFile.getAbsolutePath(),
+                "REF_FLAT=" +            getRefFlatFile(sequence).getAbsolutePath(),
+                "RIBOSOMAL_INTERVALS=" + rRnaIntervalsFile.getAbsolutePath(),
+                "STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND",
+                "IGNORE_SEQUENCE=" + ignoredSequence
+        };
+        Assert.assertEquals(runPicardCommandLine(args), 0);
+
+        final MetricsFile<RnaSeqMetrics, Comparable<?>> output = new MetricsFile<RnaSeqMetrics, Comparable<?>>();
+        output.read(new FileReader(metricsFile));
+
+        final RnaSeqMetrics metrics = output.getMetrics().get(0);
+        Assert.assertEquals(metrics.PF_ALIGNED_BASES, 900);
+        Assert.assertEquals(metrics.PF_BASES, 950);
+        Assert.assertEquals(metrics.RIBOSOMAL_BASES.longValue(), 0L);
+        Assert.assertEquals(metrics.CODING_BASES, 471);
+        Assert.assertEquals(metrics.UTR_BASES, 125);
+        Assert.assertEquals(metrics.INTRONIC_BASES, 98);
+        Assert.assertEquals(metrics.INTERGENIC_BASES, 206);
+        Assert.assertEquals(metrics.CORRECT_STRAND_READS, 7);
+        Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 6);
+        Assert.assertEquals(metrics.IGNORED_READS, 0);
+        Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 4);
+        Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2);
+        Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 3);
+        Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.666667);
+        Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.333333);
+    }
 
     public File getRefFlatFile(String sequence) throws Exception {
         // Create a refFlat file with a single gene containing two exons, one of which is overlapped by the
@@ -242,5 +344,4 @@ public File getRefFlatFile(String sequence) throws Exception {
 
         return refFlatFile;
     }
-
 }