From 1b35122c95606b595172fd477e1bbca5d5390ef5 Mon Sep 17 00:00:00 2001
From: jfsanchez <jfbioinformatics@gmail.com>
Date: Thu, 28 Mar 2019 12:23:47 +0100
Subject: [PATCH] Raw/clean length or coverage in one file

---
 scripts/R/plot-covg-hist_raw-clean.R   | 83 ++++++++++++++++++++++++++
 scripts/R/plot-length-hist_raw-clean.R | 40 +++++++++++++
 2 files changed, 123 insertions(+)
 create mode 100755 scripts/R/plot-covg-hist_raw-clean.R
 create mode 100755 scripts/R/plot-length-hist_raw-clean.R
diff --git a/scripts/R/plot-covg-hist_raw-clean.R b/scripts/R/plot-covg-hist_raw-clean.R
new file mode 100755
index 00000000..5fed97c3
--- /dev/null
+++ b/scripts/R/plot-covg-hist_raw-clean.R
@@ -0,0 +1,83 @@
+#!/usr/bin/env Rscript --vanilla
+## modified by: JFSanchezHerrero: https://github.com/JFsanchezherrero
+## 28 March, 2019
+
+# Plot coverage histograms generated by e.g. 'mccortex31 clean --covg-before out.csv ...'
+# input csv should have the columns: 'Covg' and 'NumUnitigs'
+#
+args <- commandArgs(trailingOnly=TRUE)
+if(length(args) < 2 || length(args) > 5) {
+  stop(paste("Usage: Rscript --vanilla plot-covg-hist.R <covg.csv> <out.pdf> [<cutoff> [<kcov>]]",
+             "  raw_covg.csv  - raw coverage CSV file with 'Covg', 'NumUnitigs' headings",
+             "  clean_covg.csv  - clean coverage CSV file with 'Covg', 'NumUnitigs' headings",
+             "  out.pdf   - output pdf file",
+             "  cutoff    - cleaning cutoff used [optional]",
+             "  kcov      - kmer coverage of sample [optional]",
+             "",
+             sep="\n"))
+}
+cutoff <- 0
+kcov <- 0
+
+input_raw_csv <- args[1]
+input_clean_csv <- args[2]
+output_pdf <- args[3]
+if(length(args) >= 4) { cutoff <- as.numeric(args[4]) }
+if(length(args) >= 5) { kcov <- as.numeric(args[5]) }
+
+library('ggplot2')
+library('gridExtra')
+
+read_file_and_plot = function(file2read, colourGiven, name) {
+  d <- read.csv(file=file2read,sep=',',as.is=T)
+  ymax <- max(d[,'NumKmers'],d[,'NumUnitigs'])
+  cutline <- data.frame(x=as.numeric(c(cutoff,cutoff)), y=as.numeric(c(0,ymax)))
+  kcovline <- data.frame(x=as.numeric(c(kcov,kcov)), y=as.numeric(c(0,ymax)))
+  
+  p <- ggplot() +
+    geom_bar(data=d, stat="identity", aes(x=Covg,y=NumKmers,fill=FALSE), colour=colourGiven) +
+    guides(fill=FALSE) +
+    xlab("Coverage (X)") +
+    ylab("Number of kmers") +
+    ggtitle(name) +
+    xlim(0,500)
+  
+  if(cutoff > 0) {
+    cutline[,'y'] = c(0, max(d[,'NumKmers']))
+    p <- p + geom_line(data=cutline, aes(x=x,y=y), colour="black")
+  }
+  
+  if(kcov > 0) {
+    kcovline[,'y'] = c(0, max(d[,'NumKmers']))
+    p <- p + geom_line(data=kcovline, aes(x=x,y=y), colour="black", linetype="dashed")
+  }
+  
+  q <- ggplot() +
+    geom_bar(data=d, stat="identity", aes(x=Covg,y=NumUnitigs,fill=FALSE), colour=colourGiven) +
+    guides(fill=FALSE) +
+    xlab("Coverage (X)") +
+    ylab("Number of unitigs") +
+    ggtitle(name) +
+    xlim(0,500)
+  
+  if(cutoff > 0) {
+    cutline[,'y'] = c(0, max(d[,'NumUnitigs']))
+    q <- q + geom_line(data=cutline, aes(x=x,y=y), colour="black")
+  }
+  
+  if(kcov > 0) {
+    kcovline[,'y'] = c(0, max(d[,'NumUnitigs']))
+    q <- q + geom_line(data=kcovline, aes(x=x,y=y), colour="black", linetype="dashed")
+  }
+  
+  list2return <- list("NumKmers" = p, "NumUnitigs" = q)
+  return(list2return)
+}
+
+## plot raw vs clean
+raw <- read_file_and_plot(file2read = input_raw_csv, colourGiven = "firebrick", name="Raw")
+clean <- read_file_and_plot(file2read = input_clean_csv, colourGiven = "blue", name = "Clean")
+
+pdf(output_pdf,width=12,height=6)
+grid.arrange(raw$NumKmers, raw$NumUnitigs, clean$NumKmers, clean$NumUnitigs, ncol=2, left="Kmer coverage", right="Unitig Median Coverage")
+dev.off()
diff --git a/scripts/R/plot-length-hist_raw-clean.R b/scripts/R/plot-length-hist_raw-clean.R
new file mode 100755
index 00000000..bda1aee0
--- /dev/null
+++ b/scripts/R/plot-length-hist_raw-clean.R
@@ -0,0 +1,40 @@
+#!/usr/bin/env Rscript --vanilla
+## modified by: JFSanchezHerrero: https://github.com/JFsanchezherrero
+## 28 March, 2019
+
+# Plot coverage histograms generated by e.g. 'mccortex31 clean --length-before out.csv ...'
+# input csv should have the columns: 'bp' and 'Count'
+#
+args <- commandArgs(trailingOnly=TRUE)
+if(length(args) != 3) {
+  stop("Usage: Rscript --vanilla plot-hist-hist.R <raw_lengths.csv> <clean_lengths.csv> <lengths.pdf>\n")
+}
+
+raw_input_csv = args[1]
+clean_input_csv = args[2]
+output_pdf = args[3]
+
+library('ggplot2')
+library('gridExtra')
+
+plot_length <- function(fileGiven, colourGiven, titleGiven) {
+
+  d=read.csv(file=fileGiven, sep=',',as.is=T)
+  
+  p <- ggplot(data=d, aes(bp, Count)) +
+    geom_bar(stat="identity", color=colourGiven, fill=colourGiven) +
+    scale_y_log10() +
+    xlab("Untig length (bases)") +
+    ylab("Number of unitigs") +
+    ggtitle(titleGiven)
+  
+  return(list("plot" = p))
+
+}
+
+raw <- plot_length(fileGiven = raw_input_csv, colourGiven = "seagreen", titleGiven = "Raw Unitig Length")
+clean <- plot_length(fileGiven = clean_input_csv , colourGiven = "red", titleGiven = "Clean Unitig Length")
+
+pdf(output_pdf,width=12,height=6)
+grid.arrange(raw$plot, clean$plot, ncol=2)
+dev.off()
\ No newline at end of file