bigbio · jpfeuffer · Apr 29, 2022 · Apr 21, 2022 · Apr 21, 2022 · Apr 21, 2022
diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R
@@ -1,68 +1,128 @@
 #!/usr/bin/env Rscript
 args = commandArgs(trailingOnly=TRUE)
+char_to_boolean = c("true"=TRUE, "false"=FALSE)
+usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..."
 
-usage <- "Rscript msstats_plfq.R input.csv input.mztab [list of contrasts or 'pairwise'] [default control condition or ''] [output prefix]"
-
-#TODO Waiting DIA mzTab
-if (length(args)<2) {
+#TODO rewrite mzTab in next version
+if (length(args)<1) {
     print(usage)
-    stop("At least the first two arguments must be supplied (input csv and input mzTab).n", call.=FALSE)
+    stop("At least the first one argument must be supplied (input csv).n", call.=FALSE)
 }
-if (length(args)<=2) {
+if (length(args)<2) {
     # contrasts
-    args[3] = "pairwise"
+    args[2] = "pairwise"
 }
-if (length(args)<=3) {
+if (length(args)<3) {
     # default control condition
-    args[4] = ""
+    args[3] = ""
+}
+if (length(args)<4) {
+    # removeOneFeatProts
+    args[4] = FALSE
+}
+removeOneFeatProts = args[4]
+if(typeof(removeOneFeatProts) == 'character'){
+    removeOneFeatProts = char_to_boolean[removeOneFeatProts]
+}
+
+if (length(args)<5) {
+    # keeps features with only one or two measurements across runs
+    args[5] = TRUE
+}
+fewMeasurements = args[5]
+if(typeof(fewMeasurements) == 'character'){
+    fewMeasurements = char_to_boolean[fewMeasurements]
+}
+
+if (length(args)<6) {
+    # which features to use for quantification per protein: 'top3' or 'highQuality' which removes outliers only"
+    args[6] = 'top3'
+}
+if (length(args)<7) {
+    # which summary method to use: 'TMP' (Tukey's median polish) or 'linear' (linear mixed model)
+    args[7] = 'TMP'
 }
-if (length(args)<=4) {
-    # default output prefix
-    args[5] = "msstats"
+if (length(args)<8) {
+    # outputPrefix
+    args[8] = './msstats'
 }
 
 csv_input <- args[1]
-mzTab_input <- args[2]
-contrast_str <- args[3]
-control_str <- args[4]
-out_prefix <- args[5]
-folder <- dirname(mzTab_input)
-filename <- basename(mzTab_input)
-mzTab_output <- paste0(folder,'/',out_prefix,filename)
+contrast_str <- args[2]
+control_str <- args[3]
 
 # load the MSstats library
 require(MSstats)
 require(tibble)
 require(data.table)
 
+# helper functions
+make_contrasts <- function(contrasts, levels)
+{
+    #helper function
+    indicatorRow <- function(pos,len)
+    {
+        row <- rep(0,len)
+        row[pos] <- 1
+        return(row)
+    }
+
+    if (is.factor(levels)) levels <- levels(levels)
+    if (!is.character(levels)) levels <- colnames(levels)
+
+    l <- length(levels)
+    if (l < 1)
+    {
+        stop("No levels given")
+    }
+
+    ncontr <- length(contrasts)
+    if (ncontr < 1)
+    {
+        stop("No contrasts given")
+    }
+
+    levelsenv <- new.env()
+    for (i in 1:l)
+    {
+        assign(levels[i], indicatorRow(i,l), pos=levelsenv)
+    }
+
+    contrastmat <- matrix(0, l, ncontr, dimnames=list(Levels=levels,Contrasts=contrasts))
+    for (j in 1:ncontr)
+    {
+        contrastsj <- parse(text=contrasts[j])
+        contrastmat[,j] <- eval(contrastsj, envir=levelsenv)
+    }
+    return(t(contrastmat))
+}
+
 # read dataframe into MSstats
 data <- read.csv(csv_input)
-quant <- OpenMStoMSstatsFormat(data, removeProtein_with1Feature = FALSE)
+quant <- OpenMStoMSstatsFormat(data, removeProtein_with1Feature = removeOneFeatProts, fewMeasurements=fewMeasurements)
 
 # process data
-processed.quant <- dataProcess(quant, censoredInt = 'NA')
+processed.quant <- dataProcess(quant, censoredInt = 'NA', featureSubset = args[6], summaryMethod = args[7])
 
 lvls <- levels(as.factor(data$Condition))
-if (length(lvls) == 1)
+l <- length(lvls)
+if (l == 1)
 {
     print("Only one condition found. No contrasts to be tested. If this is not the case, please check your experimental design.")
 } else {
     if (contrast_str == "pairwise")
     {
         if (control_str == "")
         {
-            l <- length(lvls)
-            contrast_mat <- matrix(nrow = l * (l-1) / 2, ncol = l)
-            rownames(contrast_mat) <- rep(NA, l * (l-1) / 2)
-            colnames(contrast_mat) <- lvls
+            contrast_mat <- matrix(nrow = l * (l-1) / 2, ncol = l, dimnames=list(Contrasts=rep(NA, l * (l-1) / 2), Levels=lvls))
             c <- 1
             for (i in 1:(l-1))
             {
                 for (j in (i+1):l)
                 {
                     comparison <- rep(0,l)
-                    comparison[i] <- -1
-                    comparison[j] <- 1
+                    comparison[i] <- 1
+                    comparison[j] <- -1
                     contrast_mat[c,] <- comparison
                     rownames(contrast_mat)[c] <- paste0(lvls[i],"-",lvls[j])
                     c <- c+1
@@ -75,10 +135,7 @@ if (length(lvls) == 1)
                 stop("Control condition not part of found levels.n", call.=FALSE)
             }
 
-            l <- length(lvls)
-            contrast_mat <- matrix(nrow = l-1, ncol = l)
-            rownames(contrast_mat) <- rep(NA, l-1)
-            colnames(contrast_mat) <- lvls
+            contrast_mat <- matrix(nrow = l-1, ncol = l, dimnames=list(Contrasts=rep(NA, l-1),Levels=lvls))
             c <- 1
             for (j in setdiff(1:l,control))
             {
@@ -91,32 +148,15 @@ if (length(lvls) == 1)
             }
         }
     } else {
-        print("Specific contrasts not supported yet.")
-        exit(1)
+        contrast_lst <- unlist(strsplit(contrast_str,";"))
+        contrast_mat <- make_contrasts(contrast_lst, lvls)
     }
 
     print ("Contrasts to be tested:")
     print (contrast_mat)
     #TODO allow for user specified contrasts
     test.MSstats <- groupComparison(contrast.matrix=contrast_mat, data=processed.quant)
 
-    #TODO allow manual input (e.g. proteins of interest)
-    write.csv(test.MSstats$ComparisonResult, "msstats_results.csv", row.names=FALSE, sep=",")
-
-    groupComparisonPlots(data=test.MSstats$ComparisonResult, type="ComparisonPlot",
-                        width=12, height=12,dot.size = 2)
-
-    test.MSstats$Volcano = test.MSstats$ComparisonResult[!is.na(test.MSstats$ComparisonResult$pvalue),]
-    groupComparisonPlots(data=test.MSstats$Volcano, type="VolcanoPlot",
-                        width=12, height=12,dot.size = 2)
-
-    # Otherwise it fails since the behaviour is undefined
-    if (nrow(contrast_mat) > 1)
-    {
-        groupComparisonPlots(data=test.MSstats$ComparisonResult, type="Heatmap",
-                            width=12, height=12,dot.size = 2)
-    }
-
     #for (comp in rownames(contrast_mat))
     #{
     #  groupComparisonPlots(data=test.MSstats$ComparisonResult, type="ComparisonPlot",
@@ -164,160 +204,30 @@ if (length(lvls) == 1)
     test.MSstats$ComparisonResult[, commoncols] <- apply(test.MSstats$Comparison[, commoncols], 2, function(x) {x[is.na(x)] <- 1; return(x)})
 
     #write comparison to CSV (one CSV per contrast)
-    writeComparisonToCSV <- function(DF)
-    {
-        write.table(DF, file=paste0("comparison_",unique(DF$Label),".csv"), quote=FALSE, sep='\t', row.names = FALSE)
-        return(DF)
-    }
-    ComparisonResultSplit <- split(test.MSstats$ComparisonResult, test.MSstats$ComparisonResult$Label)
-    for(i in 1:length(ComparisonResultSplit)){
-        writeComparisonToCSV(ComparisonResultSplit[[i]])
-    }
-
-
-    #replace quants in mzTab
-    ################# MzTab
-    # find start of the section
-    startSection <- function(file, section.identifier) {
-        data <- file(file, "r")
-        row = 0
-        while (TRUE) {
-            row = row + 1
-            line = readLines(data, n=1)
-            if (substr(line, 1, 3)==section.identifier) {
-                break
-            }
-        }
-        close(data)
-        return (row)
-    }
-
-    # find start of the mzTab section tables
-    MTD.first_row <- startSection(mzTab_input, "MTD")
-    PRT.first_row <- startSection(mzTab_input, "PRH")
-    PEP.first_row <- startSection(mzTab_input, "PEH")
-    PSM.first_row <- startSection(mzTab_input, "PSH")
-
-    # read entire mzTab and extract protein data
-    MTD <- read.table(mzTab_input, sep="\t",
-    skip=MTD.first_row-1,
-    nrows=PRT.first_row - MTD.first_row - 1 -1, # one extra empty line
-    fill=TRUE,
-    header=TRUE,
-    quote="",
-    na.strings=c("null","NA"),
-    stringsAsFactors=FALSE,
-    check.names=FALSE)
-
-
-    PRT <- read.table(mzTab_input, sep="\t",
-    skip=PRT.first_row-1,
-    nrows=PEP.first_row - PRT.first_row - 1 -1, # one extra empty line
-    fill=TRUE,
-    header=TRUE,
-    quote="",
-    na.strings=c("null","NA"),
-    stringsAsFactors=FALSE,
-    check.names=FALSE)
-
-    noquant <- as.logical(PRT[,"opt_global_result_type"] == 'protein_details')
-    PRT_skipped <- PRT[noquant,]
-    PRT <- PRT[!noquant,]
-
-    PEP <- read.table(mzTab_input, sep="\t",
-    skip=PEP.first_row-1,
-    nrows=PSM.first_row - PEP.first_row - 1 - 1, # one extra empty line
-    fill=TRUE,
-    header=TRUE,
-    quote="",
-    na.strings=c("null","NA"),
-    stringsAsFactors=FALSE,
-    check.names=FALSE)
-
-    PSM <- read.table(mzTab_input, sep="\t",
-    skip=PSM.first_row-1,
-    fill=TRUE,
-    header=TRUE,
-    quote="",
-    na.strings=c("null","NA"),
-    stringsAsFactors=FALSE,
-    check.names=FALSE)
-
-    #### Insert quantification data from MSstats into PRT section
-    # first we create a run level protein table form MSstats output
-    # then we merge the values into the mzTab PRT table
-
-
-    # Input: MSstats RunLevelData
-    # Output: Run level quantification
-    # Create a run level protein table
-    #   PROTEIN                 `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`  `11`  `12`  `13`  `14`  `15`  `16`  `17`  `18`  `19`  `20`
-    #   <fct>                 <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-    # 1 sp|A1ANS1|HTPG_PELPD   24.2  24.9  22.8  25.3  24.7  22.9  24.6  25.1  24.0  22.1  25.0  24.3  23.6  NA    NA    NA    NA    NA    NA    NA
-    # 2 sp|A2I7N1|SPA35_BOVIN  22.9  23.6  22.4  23.8  23.4  NA    23.6  23.9  22.5  NA    23.7  23.5  22.5  22.5  23.0  23.0  22.6  22.2  22.1  22.8
-    getRunLevelQuant <- function(runLevelData)
-    {
-        runlevel.long <- data.frame()
-        for (name in names(runLevelData)) {
-            singleProtein.long <- sapply(runLevelData[[name]], rbind)
-            singleProtein.long <- data.frame(singleProtein.long)
-            singleProtein.long$Protein <- name
-            runlevel.long <- rbind(runlevel.long, singleProtein.long)
-        }
-        runlevel.wide <- dcast(setDT(runlevel.long), Protein~RUN, value.var = "LogIntensities")
-        runlevel.wide <- as.data.frame(runlevel.wide)
-        return(runlevel.wide)
-    }
-
-    quant.runLevel <- MSstatsPrepareForGroupComparison(processed.quant)
-    quant.runLevel <- getRunLevelQuant(quant.runLevel)
-    colnames(quant.runLevel)[1] = "accession"
+    # writeComparisonToCSV <- function(DF)
+    # {
+    #     write.table(DF, file=paste0("comparison_",unique(DF$Label),".csv"), quote=FALSE, sep='\t', row.names = FALSE)
+    #     return(DF)
+    # }
+    # ComparisonResultSplit <- split(test.MSstats$ComparisonResult, test.MSstats$ComparisonResult$Label)
+    # for(i in 1:length(ComparisonResultSplit)){
+    #     writeComparisonToCSV(ComparisonResultSplit[[i]])
+    # }
+
+    #write all comparisons into one CSV file
+    write.table(test.MSstats$ComparisonResult, file=paste0(args[8],"_comparisons.csv"), quote=FALSE, sep='\t', row.names = FALSE)
 
-    quant.runLevel$accession<-as.character(quant.runLevel$accession)
-
-    for (col_nr in seq(from=2, to=length(colnames(quant.runLevel))))
-    {
-        colnames(quant.runLevel)[col_nr]=(paste0("protein_abundance_assay[", colnames(quant.runLevel)[col_nr] , "]"))
-    }
-
-    # TODO: check if assays in MzTab match to runs. Also true for fractionated data?
-
-    # clear old quant values from ProteinQuantifier
-    PRT[,grepl( "protein_abundance_assay" , names(PRT))] = NA
-    PRT[,grepl( "protein_abundance_study_variable" , names(PRT))] = NA
+    groupComparisonPlots(data=test.MSstats$ComparisonResult, type="ComparisonPlot",
+                        width=12, height=12,dot.size = 2)
 
-    # merge in quant.runLevel values into PRT
-    PRT_assay_cols <- grepl("protein_abundance_assay", names(PRT))
-    PRT_stdv_cols <- grepl("protein_abundance_study_variable", names(PRT))
-    RL_assay_cols <- grepl("protein_abundance_assay", names(quant.runLevel))
+    test.MSstats$Volcano = test.MSstats$ComparisonResult[!is.na(test.MSstats$ComparisonResult$pvalue),]
+    groupComparisonPlots(data=test.MSstats$Volcano, type="VolcanoPlot",
+                        width=12, height=12,dot.size = 2)
 
-    for (acc in quant.runLevel$accession)
+    # Otherwise it fails since the behaviour is undefined
+    if (nrow(contrast_mat) > 1)
     {
-        q<-which(quant.runLevel$accession==acc)
-
-        # acc from MSstats might be a group e.g., "A;B" so
-        # we check the single leader protein in mzTab PRT$accession against both A and B
-        w<-which(PRT$accession %in% strsplit(acc, ";", fixed=TRUE)[[1]])
-
-        if (length(w) == 0)
-        {
-            # TODO: check why not all summarized protein accessions are in the mzTab. Minimum number of peptides/features different?
-            print(paste("Warning: ", acc, " not in mzTab but reported by MSstats"))
-        }
-        else
-        {
-            PRT[w, PRT_assay_cols] <- quant.runLevel[q, RL_assay_cols]
-            PRT[w, PRT_stdv_cols] <- quant.runLevel[q, RL_assay_cols] # we currently store same data in stdv and assay column
-        }
+        groupComparisonPlots(data=test.MSstats$ComparisonResult, type="Heatmap",
+                            width=12, height=12,dot.size = 2)
     }
-
-    write.table(MTD, mzTab_output, sep = "\t", quote=FALSE, row.names = FALSE, na = "null")
-    write("",file=mzTab_output,append=TRUE)
-    suppressWarnings(write.table(PRT_skipped, mzTab_output, sep = "\t", quote=FALSE, row.names = FALSE, append=TRUE, na = "null"))
-    suppressWarnings(write.table(PRT, mzTab_output, sep = "\t", col.names=FALSE, quote=FALSE, row.names = FALSE, append=TRUE, na = "null"))
-    write("",file=mzTab_output,append=TRUE)
-    suppressWarnings(write.table(PEP, mzTab_output, sep = "\t", quote=FALSE, row.names = FALSE, append=TRUE, na = "null"))
-    write("",file=mzTab_output,append=TRUE)
-    suppressWarnings(write.table(PSM, mzTab_output, sep = "\t", quote=FALSE, row.names = FALSE, append=TRUE, na = "null"))
-
 }