diff --git a/assets/PXD026600.sdrf.tsv b/assets/PXD026600.sdrf.tsv index f2d97378..ed709b07 100644 --- a/assets/PXD026600.sdrf.tsv +++ b/assets/PXD026600.sdrf.tsv @@ -1,3 +1,5 @@ Source Name Characteristics[organism] Characteristics[organism part] Characteristics[age] Characteristics[strain] Characteristics[developmental stage] Characteristics[cell line] Characteristics[cell type] Characteristics[sex] Characteristics[disease] characteristics[mass] characteristics[spiked compound] Characteristics[biological replicate] Material Type assay name technology type comment[data file] comment[file uri] comment[technical replicate] comment[fraction identifier] comment[proteomics data acquisition method] comment[label] comment[instrument] comment[modification parameters] comment[modification parameters] comment[cleavage agent details] comment[dissociation method] comment[collision energy] comment[precursor mass tolerance] comment[fragment mass tolerance] factor value[spiked compound] Sample 1 Escherichia coli K-12 whole plant not available K12 not available not applicable not available not available not available 1 ug CT=Mixture;CN=UPS1;QY=0.1 fmol 1 whole organism run 1 proteomic profiling by mass spectrometry RD139_Narrow_UPS1_0_1fmol_inj1.raw ftp://massive.ucsd.edu/MSV000087597/raw/RD139_Raw_files_DIA_Narrow/RD139_Narrow_UPS1_0_1fmol_inj1.raw 1 1 NT=Data-Independent Acquisition;AC=NCIT:C161786 AC=MS:1002038;NT=label free sample NT=Orbitrap Fusion;AC=MS:1002416 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 35% CE 10 ppm 20 mmu CT=Mixture;CN=UPS1;QY=0.1 fmol Sample 1 Escherichia coli K-12 whole plant not available K12 not available not applicable not available not available not available 1 ug CT=Mixture;CN=UPS1;QY=0.1 fmol 1 whole organism run 2 proteomic profiling by mass spectrometry RD139_Narrow_UPS1_0_1fmol_inj2.raw ftp://massive.ucsd.edu/MSV000087597/raw/RD139_Raw_files_DIA_Narrow/RD139_Narrow_UPS1_0_1fmol_inj2.raw 2 1 NT=Data-Independent Acquisition;AC=NCIT:C161786 AC=MS:1002038;NT=label free sample NT=Orbitrap Fusion;AC=MS:1002416 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 35% CE 10 ppm 20 mmu CT=Mixture;CN=UPS1;QY=0.1 fmol +Sample 2 Escherichia coli K-12 whole plant not available K12 not available not applicable not available not available not available 1 ug CT=Mixture;CN=UPS1;QY=0.25 fmol 1 whole organism run 3 proteomic profiling by mass spectrometry RD139_Narrow_UPS1_0_25fmol_inj1.raw ftp://massive.ucsd.edu/MSV000087597/raw/RD139_Raw_files_DIA_Narrow/RD139_Narrow_UPS1_0_25fmol_inj1.raw 1 1 NT=Data-Independent Acquisition;AC=NCIT:C161786 AC=MS:1002038;NT=label free sample NT=Orbitrap Fusion;AC=MS:1002416 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Carbamidomethyl;TA=C;MT=Fixed;AC=UNIMOD:4 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 35% CE 10 ppm 20 mmu CT=Mixture;CN=UPS1;QY=0.25 fmol +Sample 2 Escherichia coli K-12 whole plant not available K12 not available not applicable not available not available not available 1 ug CT=Mixture;CN=UPS1;QY=0.25 fmol 1 whole organism run 4 proteomic profiling by mass spectrometry RD139_Narrow_UPS1_0_25fmol_inj2.raw ftp://massive.ucsd.edu/MSV000087597/raw/RD139_Raw_files_DIA_Narrow/RD139_Narrow_UPS1_0_25fmol_inj2.raw 2 1 NT=Data-Independent Acquisition;AC=NCIT:C161786 AC=MS:1002038;NT=label free sample NT=Orbitrap Fusion;AC=MS:1002416 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Carbamidomethyl;TA=C;MT=Fixed;AC=UNIMOD:4 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 35% CE 10 ppm 20 mmu CT=Mixture;CN=UPS1;QY=0.25 fmol diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index d65dfcec..7689e745 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -1,49 +1,112 @@ #!/usr/bin/env Rscript args = commandArgs(trailingOnly=TRUE) +char_to_boolean = c("true"=TRUE, "false"=FALSE) +usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." -usage <- "Rscript msstats_plfq.R input.csv input.mztab [list of contrasts or 'pairwise'] [default control condition or ''] [output prefix]" - -#TODO Waiting DIA mzTab -if (length(args)<2) { +#TODO rewrite mzTab in next version +if (length(args)<1) { print(usage) - stop("At least the first two arguments must be supplied (input csv and input mzTab).n", call.=FALSE) + stop("At least the first argument must be supplied (input csv).n", call.=FALSE) } -if (length(args)<=2) { +if (length(args)<2) { # contrasts - args[3] = "pairwise" + args[2] = "pairwise" } -if (length(args)<=3) { +if (length(args)<3) { # default control condition - args[4] = "" + args[3] = "" +} +if (length(args)<4) { + # removeOneFeatProts + args[4] = FALSE +} +removeOneFeatProts = args[4] +if(typeof(removeOneFeatProts) == 'character'){ + removeOneFeatProts = char_to_boolean[removeOneFeatProts] +} + +if (length(args)<5) { + # keeps features with only one or two measurements across runs + args[5] = TRUE +} +removeFewMeasurements = args[5] +if(typeof(removeFewMeasurements) == 'character'){ + removeFewMeasurements = char_to_boolean[removeFewMeasurements] +} + +if (length(args)<6) { + # which features to use for quantification per protein: 'top3' or 'highQuality' which removes outliers only" + args[6] = 'top3' +} +if (length(args)<7) { + # which summary method to use: 'TMP' (Tukey's median polish) or 'linear' (linear mixed model) + args[7] = 'TMP' } -if (length(args)<=4) { - # default output prefix - args[5] = "msstats" +if (length(args)<8) { + # outputPrefix + args[8] = './msstats' } csv_input <- args[1] -mzTab_input <- args[2] -contrast_str <- args[3] -control_str <- args[4] -out_prefix <- args[5] -folder <- dirname(mzTab_input) -filename <- basename(mzTab_input) -mzTab_output <- paste0(folder,'/',out_prefix,filename) +contrast_str <- args[2] +control_str <- args[3] # load the MSstats library require(MSstats) require(tibble) require(data.table) +# helper functions +make_contrasts <- function(contrasts, levels) +{ + #helper function + indicatorRow <- function(pos,len) + { + row <- rep(0,len) + row[pos] <- 1 + return(row) + } + + if (is.factor(levels)) levels <- levels(levels) + if (!is.character(levels)) levels <- colnames(levels) + + l <- length(levels) + if (l < 1) + { + stop("No levels given") + } + + ncontr <- length(contrasts) + if (ncontr < 1) + { + stop("No contrasts given") + } + + levelsenv <- new.env() + for (i in 1:l) + { + assign(levels[i], indicatorRow(i,l), pos=levelsenv) + } + + contrastmat <- matrix(0, l, ncontr, dimnames=list(Levels=levels,Contrasts=contrasts)) + for (j in 1:ncontr) + { + contrastsj <- parse(text=contrasts[j]) + contrastmat[,j] <- eval(contrastsj, envir=levelsenv) + } + return(t(contrastmat)) +} + # read dataframe into MSstats data <- read.csv(csv_input) -quant <- OpenMStoMSstatsFormat(data, removeProtein_with1Feature = FALSE) +quant <- OpenMStoMSstatsFormat(data, removeProtein_with1Feature = removeOneFeatProts, removeFewMeasurements=removeFewMeasurements) # process data -processed.quant <- dataProcess(quant, censoredInt = 'NA') +processed.quant <- dataProcess(quant, censoredInt = 'NA', featureSubset = args[6], summaryMethod = args[7]) lvls <- levels(as.factor(data$Condition)) -if (length(lvls) == 1) +l <- length(lvls) +if (l == 1) { print("Only one condition found. No contrasts to be tested. If this is not the case, please check your experimental design.") } else { @@ -51,18 +114,15 @@ if (length(lvls) == 1) { if (control_str == "") { - l <- length(lvls) - contrast_mat <- matrix(nrow = l * (l-1) / 2, ncol = l) - rownames(contrast_mat) <- rep(NA, l * (l-1) / 2) - colnames(contrast_mat) <- lvls + contrast_mat <- matrix(nrow = l * (l-1) / 2, ncol = l, dimnames=list(Contrasts=rep(NA, l * (l-1) / 2), Levels=lvls)) c <- 1 for (i in 1:(l-1)) { for (j in (i+1):l) { comparison <- rep(0,l) - comparison[i] <- -1 - comparison[j] <- 1 + comparison[i] <- 1 + comparison[j] <- -1 contrast_mat[c,] <- comparison rownames(contrast_mat)[c] <- paste0(lvls[i],"-",lvls[j]) c <- c+1 @@ -75,10 +135,7 @@ if (length(lvls) == 1) stop("Control condition not part of found levels.n", call.=FALSE) } - l <- length(lvls) - contrast_mat <- matrix(nrow = l-1, ncol = l) - rownames(contrast_mat) <- rep(NA, l-1) - colnames(contrast_mat) <- lvls + contrast_mat <- matrix(nrow = l-1, ncol = l, dimnames=list(Contrasts=rep(NA, l-1),Levels=lvls)) c <- 1 for (j in setdiff(1:l,control)) { @@ -91,8 +148,8 @@ if (length(lvls) == 1) } } } else { - print("Specific contrasts not supported yet.") - exit(1) + contrast_lst <- unlist(strsplit(contrast_str,";")) + contrast_mat <- make_contrasts(contrast_lst, lvls) } print ("Contrasts to be tested:") @@ -100,23 +157,6 @@ if (length(lvls) == 1) #TODO allow for user specified contrasts test.MSstats <- groupComparison(contrast.matrix=contrast_mat, data=processed.quant) - #TODO allow manual input (e.g. proteins of interest) - write.csv(test.MSstats$ComparisonResult, "msstats_results.csv", row.names=FALSE, sep=",") - - groupComparisonPlots(data=test.MSstats$ComparisonResult, type="ComparisonPlot", - width=12, height=12,dot.size = 2) - - test.MSstats$Volcano = test.MSstats$ComparisonResult[!is.na(test.MSstats$ComparisonResult$pvalue),] - groupComparisonPlots(data=test.MSstats$Volcano, type="VolcanoPlot", - width=12, height=12,dot.size = 2) - - # Otherwise it fails since the behaviour is undefined - if (nrow(contrast_mat) > 1) - { - groupComparisonPlots(data=test.MSstats$ComparisonResult, type="Heatmap", - width=12, height=12,dot.size = 2) - } - #for (comp in rownames(contrast_mat)) #{ # groupComparisonPlots(data=test.MSstats$ComparisonResult, type="ComparisonPlot", @@ -164,160 +204,30 @@ if (length(lvls) == 1) test.MSstats$ComparisonResult[, commoncols] <- apply(test.MSstats$Comparison[, commoncols], 2, function(x) {x[is.na(x)] <- 1; return(x)}) #write comparison to CSV (one CSV per contrast) - writeComparisonToCSV <- function(DF) - { - write.table(DF, file=paste0("comparison_",unique(DF$Label),".csv"), quote=FALSE, sep='\t', row.names = FALSE) - return(DF) - } - ComparisonResultSplit <- split(test.MSstats$ComparisonResult, test.MSstats$ComparisonResult$Label) - for(i in 1:length(ComparisonResultSplit)){ - writeComparisonToCSV(ComparisonResultSplit[[i]]) - } - - - #replace quants in mzTab - ################# MzTab - # find start of the section - startSection <- function(file, section.identifier) { - data <- file(file, "r") - row = 0 - while (TRUE) { - row = row + 1 - line = readLines(data, n=1) - if (substr(line, 1, 3)==section.identifier) { - break - } - } - close(data) - return (row) - } - - # find start of the mzTab section tables - MTD.first_row <- startSection(mzTab_input, "MTD") - PRT.first_row <- startSection(mzTab_input, "PRH") - PEP.first_row <- startSection(mzTab_input, "PEH") - PSM.first_row <- startSection(mzTab_input, "PSH") - - # read entire mzTab and extract protein data - MTD <- read.table(mzTab_input, sep="\t", - skip=MTD.first_row-1, - nrows=PRT.first_row - MTD.first_row - 1 -1, # one extra empty line - fill=TRUE, - header=TRUE, - quote="", - na.strings=c("null","NA"), - stringsAsFactors=FALSE, - check.names=FALSE) - - - PRT <- read.table(mzTab_input, sep="\t", - skip=PRT.first_row-1, - nrows=PEP.first_row - PRT.first_row - 1 -1, # one extra empty line - fill=TRUE, - header=TRUE, - quote="", - na.strings=c("null","NA"), - stringsAsFactors=FALSE, - check.names=FALSE) - - noquant <- as.logical(PRT[,"opt_global_result_type"] == 'protein_details') - PRT_skipped <- PRT[noquant,] - PRT <- PRT[!noquant,] - - PEP <- read.table(mzTab_input, sep="\t", - skip=PEP.first_row-1, - nrows=PSM.first_row - PEP.first_row - 1 - 1, # one extra empty line - fill=TRUE, - header=TRUE, - quote="", - na.strings=c("null","NA"), - stringsAsFactors=FALSE, - check.names=FALSE) - - PSM <- read.table(mzTab_input, sep="\t", - skip=PSM.first_row-1, - fill=TRUE, - header=TRUE, - quote="", - na.strings=c("null","NA"), - stringsAsFactors=FALSE, - check.names=FALSE) - - #### Insert quantification data from MSstats into PRT section - # first we create a run level protein table form MSstats output - # then we merge the values into the mzTab PRT table - - - # Input: MSstats RunLevelData - # Output: Run level quantification - # Create a run level protein table - # PROTEIN `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11` `12` `13` `14` `15` `16` `17` `18` `19` `20` - # - # 1 sp|A1ANS1|HTPG_PELPD 24.2 24.9 22.8 25.3 24.7 22.9 24.6 25.1 24.0 22.1 25.0 24.3 23.6 NA NA NA NA NA NA NA - # 2 sp|A2I7N1|SPA35_BOVIN 22.9 23.6 22.4 23.8 23.4 NA 23.6 23.9 22.5 NA 23.7 23.5 22.5 22.5 23.0 23.0 22.6 22.2 22.1 22.8 - getRunLevelQuant <- function(runLevelData) - { - runlevel.long <- data.frame() - for (name in names(runLevelData)) { - singleProtein.long <- sapply(runLevelData[[name]], rbind) - singleProtein.long <- data.frame(singleProtein.long) - singleProtein.long$Protein <- name - runlevel.long <- rbind(runlevel.long, singleProtein.long) - } - runlevel.wide <- dcast(setDT(runlevel.long), Protein~RUN, value.var = "LogIntensities") - runlevel.wide <- as.data.frame(runlevel.wide) - return(runlevel.wide) - } - - quant.runLevel <- MSstatsPrepareForGroupComparison(processed.quant) - quant.runLevel <- getRunLevelQuant(quant.runLevel) - colnames(quant.runLevel)[1] = "accession" + # writeComparisonToCSV <- function(DF) + # { + # write.table(DF, file=paste0("comparison_",unique(DF$Label),".csv"), quote=FALSE, sep='\t', row.names = FALSE) + # return(DF) + # } + # ComparisonResultSplit <- split(test.MSstats$ComparisonResult, test.MSstats$ComparisonResult$Label) + # for(i in 1:length(ComparisonResultSplit)){ + # writeComparisonToCSV(ComparisonResultSplit[[i]]) + # } + + #write all comparisons into one CSV file + write.table(test.MSstats$ComparisonResult, file=paste0(args[8],"_comparisons.csv"), quote=FALSE, sep='\t', row.names = FALSE) - quant.runLevel$accession<-as.character(quant.runLevel$accession) - - for (col_nr in seq(from=2, to=length(colnames(quant.runLevel)))) - { - colnames(quant.runLevel)[col_nr]=(paste0("protein_abundance_assay[", colnames(quant.runLevel)[col_nr] , "]")) - } - - # TODO: check if assays in MzTab match to runs. Also true for fractionated data? - - # clear old quant values from ProteinQuantifier - PRT[,grepl( "protein_abundance_assay" , names(PRT))] = NA - PRT[,grepl( "protein_abundance_study_variable" , names(PRT))] = NA + groupComparisonPlots(data=test.MSstats$ComparisonResult, type="ComparisonPlot", + width=12, height=12,dot.size = 2) - # merge in quant.runLevel values into PRT - PRT_assay_cols <- grepl("protein_abundance_assay", names(PRT)) - PRT_stdv_cols <- grepl("protein_abundance_study_variable", names(PRT)) - RL_assay_cols <- grepl("protein_abundance_assay", names(quant.runLevel)) + test.MSstats$Volcano = test.MSstats$ComparisonResult[!is.na(test.MSstats$ComparisonResult$pvalue),] + groupComparisonPlots(data=test.MSstats$Volcano, type="VolcanoPlot", + width=12, height=12,dot.size = 2) - for (acc in quant.runLevel$accession) + # Otherwise it fails since the behaviour is undefined + if (nrow(contrast_mat) > 1) { - q<-which(quant.runLevel$accession==acc) - - # acc from MSstats might be a group e.g., "A;B" so - # we check the single leader protein in mzTab PRT$accession against both A and B - w<-which(PRT$accession %in% strsplit(acc, ";", fixed=TRUE)[[1]]) - - if (length(w) == 0) - { - # TODO: check why not all summarized protein accessions are in the mzTab. Minimum number of peptides/features different? - print(paste("Warning: ", acc, " not in mzTab but reported by MSstats")) - } - else - { - PRT[w, PRT_assay_cols] <- quant.runLevel[q, RL_assay_cols] - PRT[w, PRT_stdv_cols] <- quant.runLevel[q, RL_assay_cols] # we currently store same data in stdv and assay column - } + groupComparisonPlots(data=test.MSstats$ComparisonResult, type="Heatmap", + width=12, height=12,dot.size = 2) } - - write.table(MTD, mzTab_output, sep = "\t", quote=FALSE, row.names = FALSE, na = "null") - write("",file=mzTab_output,append=TRUE) - suppressWarnings(write.table(PRT_skipped, mzTab_output, sep = "\t", quote=FALSE, row.names = FALSE, append=TRUE, na = "null")) - suppressWarnings(write.table(PRT, mzTab_output, sep = "\t", col.names=FALSE, quote=FALSE, row.names = FALSE, append=TRUE, na = "null")) - write("",file=mzTab_output,append=TRUE) - suppressWarnings(write.table(PEP, mzTab_output, sep = "\t", quote=FALSE, row.names = FALSE, append=TRUE, na = "null")) - write("",file=mzTab_output,append=TRUE) - suppressWarnings(write.table(PSM, mzTab_output, sep = "\t", quote=FALSE, row.names = FALSE, append=TRUE, na = "null")) - } diff --git a/bin/msstats_tmt.R b/bin/msstats_tmt.R new file mode 100755 index 00000000..e6d578e7 --- /dev/null +++ b/bin/msstats_tmt.R @@ -0,0 +1,168 @@ +#!/usr/bin/env Rscript +args = commandArgs(trailingOnly=TRUE) +char_to_boolean = c("true"=TRUE, "false"=FALSE) + +usage <- "Rscript msstats_tmt.R input.csv [list of contrasts or 'pairwise'] [default control condition or '']... [normalization based reference channel]" +if (length(args)<1) { + print(usage) + stop("At least the first argument must be supplied (input csv).n", call.=FALSE) +} +if (length(args)<2) { + # contrasts + args[2] = "pairwise" +} +if (length(args)<3) { + # default control condition + args[3] = "" +} + +if (length(args)<4) { + # removeOneFeatProts + args[4] = FALSE +} +rmProtein_with1Feature = args[4] +if(typeof(rmProtein_with1Feature) == 'character'){ + rmProtein_with1Feature = char_to_boolean[rmProtein_with1Feature] +} + +if (length(args)<5) { + # use unique peptide + args[5] = TRUE +} +useUniquePeptide = args[5] +if(typeof(useUniquePeptide) == 'character'){ + useUniquePeptide = char_to_boolean[useUniquePeptide] +} + +if (length(args)<6) { + # remove the features that have 1 or 2 measurements within each Run. + args[6] = TRUE +} +rmPSM_withfewMea_withinRun = args[6] +if(typeof(rmPSM_withfewMea_withinRun) == 'character'){ + rmPSM_withfewMea_withinRun = char_to_boolean[rmPSM_withfewMea_withinRun] +} + +if (length(args)<7) { + # sum or max - when there are multiple measurements for certain feature in certain Run. + args[7] = 'sum' +} + +if (length(args)<8) { + # summarization methods to protein-level can be performed: "msstats(default)" + args[8] = "msstats" +} + +if (length(args)<9) { + # Global median normalization on peptide level data + args[9] = TRUE +} +global_norm = args[9] +if(typeof(global_norm) == 'character'){ + global_norm = char_to_boolean[global_norm] +} + +if (length(args)<10) { + # Remove norm channel + args[10] = TRUE +} +remove_norm_channel = args[10] +if(typeof(remove_norm_channel) == 'character'){ + remove_norm_channel = char_to_boolean[remove_norm_channel] +} + +if (length(args)<11) { + # default Reference channel based normalization between MS runs on protein level data. + # Reference Channel annotated by 'Norm' in Condition. + args[11] = TRUE +} +reference_norm = args[11] +if(typeof(reference_norm) == 'character'){ + reference_norm = char_to_boolean[reference_norm] +} + +csv_input <- args[1] +contrast_str <- args[2] +control_str <- args[3] + +require(MSstatsTMT) +# read dataframe into MSstatsTMT +data <- read.csv(csv_input) +quant <- OpenMStoMSstatsTMTFormat(data, useUniquePeptide=useUniquePeptide, rmPSM_withfewMea_withinRun=rmPSM_withfewMea_withinRun, + rmProtein_with1Feature=rmProtein_with1Feature +) + +# protein summarization +processed.quant <- proteinSummarization(quant, method=args[8],remove_empty_channel=TRUE, global_norm=global_norm, + reference_norm=reference_norm, remove_norm_channel=remove_norm_channel +) + +dataProcessPlotsTMT(processed.quant, "ProfilePlot", width=12, height=12, which.Protein="all") +dataProcessPlotsTMT(processed.quant, "QCPlot", width=12, height=12, which.Protein="allonly") + +lvls <- levels(as.factor(processed.quant$ProteinLevelData$Condition)) +l <- length(lvls) +if (l == 1) +{ + print("Only one condition found. No contrasts to be tested. If this is not the case, please check your experimental design.") +} else { + if (contrast_str == "pairwise") + { + if (control_str == "") + { + contrast_mat <- matrix(nrow = l * (l-1) / 2, ncol = l, dimnames=list(Contrasts=rep(NA, l * (l-1) / 2), Levels=lvls)) + c <- 1 + for (i in 1:(l-1)) + { + for (j in (i+1):l) + { + comparison <- rep(0,l) + comparison[i] <- 1 + comparison[j] <- -1 + contrast_mat[c,] <- comparison + rownames(contrast_mat)[c] <- paste0(lvls[i],"-",lvls[j]) + c <- c+1 + } + } + } else { + control <- which(as.character(lvls) == control_str) + if (length(control) == 0) + { + stop("Control condition not part of found levels.n", call.=FALSE) + } + + contrast_mat <- matrix(nrow = l-1, ncol = l, dimnames=list(Contrasts=rep(NA, l-1),Levels=lvls)) + c <- 1 + for (j in setdiff(1:l,control)) + { + comparison <- rep(0,l) + comparison[i] <- -1 + comparison[j] <- 1 + contrast_mat[c,] <- comparison + rownames(contrast_mat)[c] <- paste0(lvls[i],"-",lvls[j]) + c <- c+1 + } + } + } else { + contrast_lst <- unlist(strsplit(contrast_str,";")) + contrast_mat <- make_contrasts(contrast_lst, lvls) + } + print ("Contrasts to be tested:") + print (contrast_mat) + #TODO allow for user specified contrasts + test.MSstatsTMT <- groupComparisonTMT(contrast.matrix=contrast_mat, data=processed.quant) + + #TODO allow manual input (e.g. proteins of interest) + write.table(test.MSstatsTMT$ComparisonResult, file=paste0("msstatsiso_results.csv"), quote=FALSE, sep='\t', row.names = FALSE) + + #write comparison to CSV (one CSV per contrast) + # writeComparisonToCSV <- function(DF) + # { + # write.table(DF, file=paste0("comparison_",unique(DF$Label),".csv"), quote=FALSE, sep='\t', row.names = FALSE) + # return(DF) + # } + # ComparisonResultSplit <- split(test.MSstatsTMT$ComparisonResult, test.MSstatsTMT$ComparisonResult$Label) + # for(i in 1:length(ComparisonResultSplit)){ + # writeComparisonToCSV(ComparisonResultSplit[[i]]) + # } +} diff --git a/conf/test_dia.config b/conf/test_dia.config index 8ab5b74b..0a9f2253 100644 --- a/conf/test_dia.config +++ b/conf/test_dia.config @@ -22,7 +22,7 @@ params { outdir = './results_dia' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/lfq_ci/PXD026600/PXD026600.sdrf.tsv' + input = 'https://raw.githubusercontent.com/daichengxin/quantms/dev/assets/PXD026600.sdrf.tsv' database = 'ftp://massive.ucsd.edu/MSV000087597/sequence/REF_EColi_K12_UPS1_combined.fasta' min_pr_mz = 350 max_pr_mz = 950 diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index af03e3ef..f21461e9 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -24,7 +24,7 @@ process DIANNCONVERT { diann_convert.py convert \\ --diann_report ${report} \\ --exp_design ${exp_design} \\ - > trans_to_msstats.log + |& tee trans_to_msstats.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/diannsearch/main.nf b/modules/local/diannsearch/main.nf index d10dda3b..7bbc56ef 100644 --- a/modules/local/diannsearch/main.nf +++ b/modules/local/diannsearch/main.nf @@ -53,7 +53,7 @@ process DIANNSEARCH { ${normalize} \\ --out diann_report.tsv \\ --verbose $params.diann_debug \\ - > diann.log + |& tee diann.log cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/generate_diann_cfg/main.nf b/modules/local/generate_diann_cfg/main.nf index 26a8e4f9..54e34539 100644 --- a/modules/local/generate_diann_cfg/main.nf +++ b/modules/local/generate_diann_cfg/main.nf @@ -29,7 +29,7 @@ process GENERATE_DIANN_CFG { --precursor_tolerence_unit ${meta.precursormasstoleranceunit} \\ --fragment_tolerence ${meta.fragmentmasstolerance} \\ --fragment_tolerence_unit ${meta.fragmentmasstoleranceunit} \\ - > GENERATE_DIANN_CFG.log + |& tee GENERATE_DIANN_CFG.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/librarygeneration/main.nf b/modules/local/librarygeneration/main.nf index d48cf74d..6191a5ff 100644 --- a/modules/local/librarygeneration/main.nf +++ b/modules/local/librarygeneration/main.nf @@ -46,7 +46,7 @@ process LIBRARYGENERATION { --threads ${task.cpus} \\ --predictor \\ --verbose $params.diann_debug \\ - > diann.log + |& tee diann.log cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/msstats/main.nf b/modules/local/msstats/main.nf index b1d319aa..905012ce 100644 --- a/modules/local/msstats/main.nf +++ b/modules/local/msstats/main.nf @@ -10,25 +10,29 @@ process MSSTATS { input: path out_msstats - path out_mztab_msstats output: // The generation of the PDFs from MSstats are very unstable, especially with auto-contrasts. // And users can easily fix anything based on the csv and the included script -> make optional path "*.pdf" optional true - path "*.mzTab", optional: true, emit: msstats_mztab path "*.csv", emit: msstats_csv path "*.log", emit: log path "versions.yml" , emit: version script: def args = task.ext.args ?: '' + ref_con = params.ref_condition ?: "" """ msstats_plfq.R \\ ${out_msstats} \\ - ${out_mztab_msstats} \\ - ${args} \\ + ${params.contrasts} \\ + "${ref_con}" \\ + ${params.msstats_remove_one_feat_prot} \\ + ${params.msstatslfq_removeFewMeasurements} \\ + ${params.msstatslfq_feature_subset_protein} \\ + ${params.msstatslfq_quant_summary_method} \\ + $args \\ > msstats.log \\ || echo "Optional MSstats step failed. Please check logs and re-run or do a manual statistical analysis." diff --git a/modules/local/msstats/meta.yml b/modules/local/msstats/meta.yml index 860b12fc..98a69c2d 100644 --- a/modules/local/msstats/meta.yml +++ b/modules/local/msstats/meta.yml @@ -14,15 +14,7 @@ input: type: file description: MSstats input file with analysis results for statistical downstream analysis in MSstats. pattern: "out_msstats.csv" - - out_mztab_msstats: - type: file - description: mzTab file with analysis results - pattern: "out.mzTab" output: - - msstats_mztab: - type: file - description: mzTab file with MSstats quantification results - pattern: "*.mzTab" - msstats_csv: type: file description: csv file with results of statistical testing diff --git a/modules/local/msstatstmt/main.nf b/modules/local/msstatstmt/main.nf new file mode 100644 index 00000000..5b0d3229 --- /dev/null +++ b/modules/local/msstatstmt/main.nf @@ -0,0 +1,49 @@ +process MSSTATSTMT { + label 'process_medium' + + conda (params.enable_conda ? "bioconda::bioconductor-msstatstmt=2.2.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-msstatstmt:2.2.0--r41hdfd78af_0" + } else { + container "quay.io/biocontainers/bioconductor-msstatstmt:2.2.0--r41hdfd78af_0" + } + + input: + path out_msstats_tmt + + output: + // The generation of the PDFs from MSstatsTMT are very unstable, especially with auto-contrasts. + // And users can easily fix anything based on the csv and the included script -> make optional + path "*.pdf" optional true + path "*.csv", emit: msstats_csv + path "*.log" + path "versions.yml" , emit: version + + script: + def args = task.ext.args ?: '' + ref_con = params.ref_condition ?: "" + + """ + msstats_tmt.R \\ + ${out_msstats_tmt} \\ + ${params.contrasts} \\ + "${ref_con}" \\ + ${params.msstats_remove_one_feat_prot} \\ + ${params.msstatsiso_useunique_peptide} \\ + ${params.msstatsiso_rmpsm_withfewmea_withinrun} \\ + ${params.msstatsiso_summaryformultiple_psm} \\ + ${params.msstatsiso_summarization_method} \\ + ${params.msstatsiso_global_norm} \\ + ${params.msstatsiso_remove_norm_channel} \\ + ${params.msstatsiso_reference_normalization} \\ + $args \\ + > msstats_tmt.log \\ + || echo "Optional MSstatsTMT step failed. Please check logs and re-run or do a manual statistical analysis." + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + MSstatsTMT: \$(echo "2.2.0") + END_VERSIONS + """ +} diff --git a/modules/local/msstatstmt/meta.yml b/modules/local/msstatstmt/meta.yml new file mode 100644 index 00000000..610f5e19 --- /dev/null +++ b/modules/local/msstatstmt/meta.yml @@ -0,0 +1,31 @@ +name: MSSTATSTMT +description: | + A module to performance proteomics statistical analysis with tandem mass tag (TMT) labeling using MSstatsTMT tool. + - MSstatsTMT + - downStream analysis +tools: + - MSstatsTMT: + description: | + MSstatsTMT is an R-based package for detecting differentially abundant proteins in shotgun mass spectrometry-based proteomic experiments with tandem mass tag (TMT) labeling + homepage: https://github.com/Vitek-Lab/MSstatsTMT + documentation: https://www.bioconductor.org/packages/release/bioc/vignettes/MSstatsTMT/inst/doc/MSstatsTMT.html +input: + - out_msstats: + type: file + description: MSstats input file with analysis results for statistical downstream analysis in MSstatsTMT. + pattern: "out_msstats.csv" +output: + - msstats_csv: + type: file + description: csv file with results of statistical testing + pattern: "*.csv" + - version: + type: file + description: File containing software version + pattern: "versions.yml" + - log: + type: file + description: log file + pattern: "*.log" +authors: + - "@daichengxin" diff --git a/modules/local/openms/consensusid/main.nf b/modules/local/openms/consensusid/main.nf index 752acb65..936e2524 100644 --- a/modules/local/openms/consensusid/main.nf +++ b/modules/local/openms/consensusid/main.nf @@ -32,7 +32,7 @@ process CONSENSUSID { -filter:considered_hits $params.consensusid_considered_top_hits \\ -debug $params.consensusid_debug \\ $args \\ - > ${meta.id}_consensusID.log + |& tee ${meta.id}_consensusID.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/decoydatabase/main.nf b/modules/local/openms/decoydatabase/main.nf index 19dd393a..bdd7ea81 100644 --- a/modules/local/openms/decoydatabase/main.nf +++ b/modules/local/openms/decoydatabase/main.nf @@ -29,7 +29,7 @@ process DECOYDATABASE { -shuffle_sequence_identity_threshold $params.shuffle_sequence_identity_threshold \\ -debug $params.decoydatabase_debug \\ $args \\ - > ${db_for_decoy.baseName}_decoy_database.log + |& tee ${db_for_decoy.baseName}_decoy_database.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/epifany/main.nf b/modules/local/openms/epifany/main.nf index 98592664..e968af3b 100644 --- a/modules/local/openms/epifany/main.nf +++ b/modules/local/openms/epifany/main.nf @@ -32,7 +32,7 @@ process EPIFANY { -algorithm:top_PSMs $params.top_PSMs \\ -out ${consus_file.baseName}_epi.consensusXML \\ $args \\ - > ${consus_file.baseName}_inference.log + |& tee ${consus_file.baseName}_inference.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/extractpsmfeatures/main.nf b/modules/local/openms/extractpsmfeatures/main.nf index 3f6d3dc6..2bc7fa1f 100644 --- a/modules/local/openms/extractpsmfeatures/main.nf +++ b/modules/local/openms/extractpsmfeatures/main.nf @@ -26,7 +26,7 @@ process EXTRACTPSMFEATURES { -out ${id_file.baseName}_feat.idXML \\ -threads $task.cpus \\ $args \\ - > ${id_file.baseName}_extract_psm_feature.log + |& tee ${id_file.baseName}_extract_psm_feature.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/falsediscoveryrate/main.nf b/modules/local/openms/falsediscoveryrate/main.nf index 43d53816..9c0d6580 100644 --- a/modules/local/openms/falsediscoveryrate/main.nf +++ b/modules/local/openms/falsediscoveryrate/main.nf @@ -29,7 +29,7 @@ process FALSEDISCOVERYRATE { -algorithm:add_decoy_peptides \\ -algorithm:add_decoy_proteins \\ $args \\ - > ${id_file.baseName}_fdr.log + |& tee ${id_file.baseName}_fdr.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/filemerge/main.nf b/modules/local/openms/filemerge/main.nf index 99bbd1d2..bbb433bb 100644 --- a/modules/local/openms/filemerge/main.nf +++ b/modules/local/openms/filemerge/main.nf @@ -28,7 +28,7 @@ process FILEMERGE { -threads $task.cpus \\ -out ID_mapper_merge.consensusXML \\ $args \\ - > ID_mapper_merge.log + |& tee ID_mapper_merge.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/idconflictresolver/main.nf b/modules/local/openms/idconflictresolver/main.nf index 6857104c..45149e7e 100644 --- a/modules/local/openms/idconflictresolver/main.nf +++ b/modules/local/openms/idconflictresolver/main.nf @@ -24,7 +24,7 @@ process IDCONFLICTRESOLVER { -threads $task.cpus \\ -out ${consus_file.baseName}_resconf.consensusXML \\ $args \\ - > ${consus_file.baseName}_resconf.log + |& tee ${consus_file.baseName}_resconf.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/idfilter/main.nf b/modules/local/openms/idfilter/main.nf index d4763917..9f6d02ba 100644 --- a/modules/local/openms/idfilter/main.nf +++ b/modules/local/openms/idfilter/main.nf @@ -27,7 +27,7 @@ process IDFILTER { -out ${id_file.baseName}_filter$suffix \\ -threads $task.cpus \\ $args \\ - > ${id_file.baseName}_idfilter.log + |& tee ${id_file.baseName}_idfilter.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/idmapper/main.nf b/modules/local/openms/idmapper/main.nf index 0d3ff125..246b937a 100644 --- a/modules/local/openms/idmapper/main.nf +++ b/modules/local/openms/idmapper/main.nf @@ -28,7 +28,7 @@ process IDMAPPER { -threads $task.cpus \\ -out ${id_file.baseName}_map.consensusXML \\ $args \\ - > ${id_file.baseName}_map.log + |& tee ${id_file.baseName}_map.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/idpep/main.nf b/modules/local/openms/idpep/main.nf index c6185288..de9e23b6 100644 --- a/modules/local/openms/idpep/main.nf +++ b/modules/local/openms/idpep/main.nf @@ -25,7 +25,7 @@ process IDPEP { -fit_algorithm:outlier_handling $params.outlier_handling \\ -threads ${task.cpus} \\ $args \\ - > ${id_file.baseName}_idpep.log + |& tee ${id_file.baseName}_idpep.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/idscoreswitcher/main.nf b/modules/local/openms/idscoreswitcher/main.nf index a6644af5..6c3e149e 100644 --- a/modules/local/openms/idscoreswitcher/main.nf +++ b/modules/local/openms/idscoreswitcher/main.nf @@ -27,7 +27,7 @@ process IDSCORESWITCHER { -threads $task.cpus \\ -new_score ${new_score} \\ $args \\ - > ${id_file.baseName}_switch.log + |& tee ${id_file.baseName}_switch.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/indexpeptides/main.nf b/modules/local/openms/indexpeptides/main.nf index f733d838..609043e5 100644 --- a/modules/local/openms/indexpeptides/main.nf +++ b/modules/local/openms/indexpeptides/main.nf @@ -52,7 +52,7 @@ process INDEXPEPTIDES { ${il} \\ ${allow_um} \\ $args \\ - > ${id_file.baseName}_index_peptides.log + |& tee ${id_file.baseName}_index_peptides.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/isobaricanalyzer/main.nf b/modules/local/openms/isobaricanalyzer/main.nf index 77599888..5847ce2d 100644 --- a/modules/local/openms/isobaricanalyzer/main.nf +++ b/modules/local/openms/isobaricanalyzer/main.nf @@ -40,7 +40,7 @@ process ISOBARICANALYZER { -${meta.labelling_type}:reference_channel $params.reference_channel \\ -out ${mzml_file.baseName}_iso.consensusXML \\ $args \\ - > ${mzml_file.baseName}_isob.log + |& tee ${mzml_file.baseName}_isob.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/msstatsconverter/main.nf b/modules/local/openms/msstatsconverter/main.nf index 59b17411..04433993 100644 --- a/modules/local/openms/msstatsconverter/main.nf +++ b/modules/local/openms/msstatsconverter/main.nf @@ -26,7 +26,7 @@ process MSSTATSCONVERTER { -method ${quant_method} \\ -out out_msstats.csv \\ $args \\ - > MSstatsConverter.log + |& tee MSstatsConverter.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/mzmlindexing/main.nf b/modules/local/openms/mzmlindexing/main.nf index 62fff846..0da4e445 100644 --- a/modules/local/openms/mzmlindexing/main.nf +++ b/modules/local/openms/mzmlindexing/main.nf @@ -21,7 +21,7 @@ process MZMLINDEXING { """ mkdir out - FileConverter -in ${mzmlfile} -out out/${mzmlfile.baseName}.mzML > ${mzmlfile.baseName}_mzmlindexing.log + FileConverter -in ${mzmlfile} -out out/${mzmlfile.baseName}.mzML |& tee ${mzmlfile.baseName}_mzmlindexing.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/openmspeakpicker/main.nf b/modules/local/openms/openmspeakpicker/main.nf index 9346abfb..f067fac6 100644 --- a/modules/local/openms/openmspeakpicker/main.nf +++ b/modules/local/openms/openmspeakpicker/main.nf @@ -31,7 +31,7 @@ process OPENMSPEAKPICKER { -processOption ${in_mem} \\ ${lvls} \\ $args \\ - > ${mzml_file.baseName}_pp.log + |& tee ${mzml_file.baseName}_pp.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/proteininference/main.nf b/modules/local/openms/proteininference/main.nf index 1ed84361..807190e5 100644 --- a/modules/local/openms/proteininference/main.nf +++ b/modules/local/openms/proteininference/main.nf @@ -33,7 +33,7 @@ process PROTEININFERENCE { -Algorithm:min_peptides_per_protein $params.min_peptides_per_protein \\ -out ${consus_file.baseName}_epi.consensusXML \\ $args \\ - > ${consus_file.baseName}_inference.log + |& tee ${consus_file.baseName}_inference.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/proteinquantifier/main.nf b/modules/local/openms/proteinquantifier/main.nf index 49ab74c5..3015135f 100644 --- a/modules/local/openms/proteinquantifier/main.nf +++ b/modules/local/openms/proteinquantifier/main.nf @@ -39,7 +39,7 @@ process PROTEINQUANTIFIER { -threads $task.cpus \\ ${normalize} \\ $args \\ - > pro_quant.log + |& tee pro_quant.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/proteomicslfq/main.nf b/modules/local/openms/proteomicslfq/main.nf index 1aff7dca..1cbc1656 100644 --- a/modules/local/openms/proteomicslfq/main.nf +++ b/modules/local/openms/proteomicslfq/main.nf @@ -54,7 +54,7 @@ process PROTEOMICSLFQ { -out_cxml out.consensusXML \\ -proteinFDR ${params.protein_level_fdr_cutoff} \\ $args \\ - > proteomicslfq.log + |& tee proteomicslfq.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/thirdparty/luciphoradapter/main.nf b/modules/local/openms/thirdparty/luciphoradapter/main.nf index 9ba3cd45..bea9a955 100644 --- a/modules/local/openms/thirdparty/luciphoradapter/main.nf +++ b/modules/local/openms/thirdparty/luciphoradapter/main.nf @@ -45,7 +45,7 @@ process LUCIPHORADAPTER { -max_charge_state $params.max_precursor_charge \\ -max_peptide_length $params.max_peptide_length \\ $args \\ - > ${id_file.baseName}_luciphor.log + |& tee ${id_file.baseName}_luciphor.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/thirdparty/percolator/main.nf b/modules/local/openms/thirdparty/percolator/main.nf index 798563fc..d3a1d22d 100644 --- a/modules/local/openms/thirdparty/percolator/main.nf +++ b/modules/local/openms/thirdparty/percolator/main.nf @@ -28,7 +28,7 @@ process PERCOLATOR { -post_processing_tdc \\ -score_type pep \\ $args \\ - > ${id_file.baseName}_percolator.log + |& tee ${id_file.baseName}_percolator.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/thirdparty/searchenginecomet/main.nf b/modules/local/openms/thirdparty/searchenginecomet/main.nf index 9eb86a1a..cd396e30 100644 --- a/modules/local/openms/thirdparty/searchenginecomet/main.nf +++ b/modules/local/openms/thirdparty/searchenginecomet/main.nf @@ -106,7 +106,7 @@ process SEARCHENGINECOMET { -debug $params.db_debug \\ -force \\ $args \\ - > ${mzml_file.baseName}_comet.log + |& tee ${mzml_file.baseName}_comet.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/thirdparty/searchenginemsgf/main.nf b/modules/local/openms/thirdparty/searchenginemsgf/main.nf index 07eb51e0..e69d1c04 100644 --- a/modules/local/openms/thirdparty/searchenginemsgf/main.nf +++ b/modules/local/openms/thirdparty/searchenginemsgf/main.nf @@ -84,7 +84,7 @@ process SEARCHENGINEMSGF { -PeptideIndexing:unmatched_action ${params.unmatched_action} \\ -debug $params.db_debug \\ $args \\ - > ${mzml_file.baseName}_msgf.log + |& tee ${mzml_file.baseName}_msgf.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 7a5b6c66..0bb80706 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -19,7 +19,7 @@ process SAMPLESHEET_CHECK { def args = task.ext.args ?: '' """ - check_samplesheet.py "${input_file}" ${is_sdrf} --CHECK_MS > input_check.log + check_samplesheet.py "${input_file}" ${is_sdrf} --CHECK_MS |& tee input_check.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/sdrfparsing/main.nf b/modules/local/sdrfparsing/main.nf index 767b3350..7941b2e9 100644 --- a/modules/local/sdrfparsing/main.nf +++ b/modules/local/sdrfparsing/main.nf @@ -24,7 +24,7 @@ process SDRFPARSING { ## -l for legacy behavior to always add sample columns ## TODO Update the sdrf-pipelines to dynamic print versions - parse_sdrf convert-openms -t2 -l -s ${sdrf} > sdrf_parsing.log + parse_sdrf convert-openms -t2 -l -s ${sdrf} |& tee sdrf_parsing.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/thermorawfileparser/main.nf b/modules/local/thermorawfileparser/main.nf index eb6faceb..64794222 100644 --- a/modules/local/thermorawfileparser/main.nf +++ b/modules/local/thermorawfileparser/main.nf @@ -23,7 +23,7 @@ process THERMORAWFILEPARSER { def prefix = task.ext.prefix ?: "${meta.id}" """ - ThermoRawFileParser.sh -i=${rawfile} -f=2 -o=./ > ${rawfile.baseName}_conversion.log + ThermoRawFileParser.sh -i=${rawfile} -f=2 -o=./ |& tee ${rawfile.baseName}_conversion.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index dc226019..b040bf8b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -154,10 +154,25 @@ params { max_fr_mz = null diann_normalize = true - // MSstats - skip_post_msstats = false - ref_condition = null - contrasts = 'pairwise' + // MSstats general options + msstats_remove_one_feat_prot = true + ref_condition = null + contrasts = 'pairwise' + skip_post_msstats = false + + // MSstats LFQ options + msstatslfq_removeFewMeasurements = true + msstatslfq_feature_subset_protein = 'top3' + msstatslfq_quant_summary_method = 'TMP' + + // MSstats ISO options + msstatsiso_useunique_peptide = true + msstatsiso_rmpsm_withfewmea_withinrun = true + msstatsiso_summaryformultiple_psm = 'sum' + msstatsiso_summarization_method = 'msstats' + msstatsiso_global_norm = true + msstatsiso_remove_norm_channel = true + msstatsiso_reference_normalization = true // PTXQC enable_qc = false diff --git a/nextflow_schema.json b/nextflow_schema.json index c5f081dc..ee1b6ed7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input"], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -603,9 +603,24 @@ "protein_quantification_dda": { "title": "Protein Quantification (DDA)", "type": "object", - "description": "General protein quantification settings for both LFQ and Isobaric labelling.", + "description": "General protein quantification settings for both LFQ and isobaric labelling.", "default": "", "properties": { + "labelling_type": { + "type": "string", + "description": "Specify the labelling method that was used. Will be ignored if SDRF was given but is mandatory otherwise", + "fa_icon": "fas fa-font", + "help_text": "Quantification method used in the experiment.", + "enum": [ + "label free sample", + "itraq4plex", + "itraq8plex", + "tmt6plex", + "tmt10plex", + "tmt11plex", + "tmt16plex" + ] + }, "top": { "type": "integer", "description": "Calculate protein abundance from this number of proteotypic peptides (most abundant first; '0' for all, Default 3)", @@ -718,21 +733,6 @@ "description": "Extracts and normalizes labeling information", "default": "", "properties": { - "labelling_type": { - "type": "string", - "description": "Isobaric labelling method. Will be ignored if SDRF was given.", - "fa_icon": "fas fa-font", - "help_text": "Isobaric Quantification method used in the experiment.", - "enum": [ - "label free sample", - "itraq4plex", - "itraq8plex", - "tmt6plex", - "tmt10plex", - "tmt11plex", - "tmt16plex" - ] - }, "select_activation": { "type": "string", "description": "Operate only on MSn scans where any of its precursors features a certain activation method. Set to empty to disable.", @@ -863,7 +863,7 @@ "properties": { "skip_post_msstats": { "type": "boolean", - "description": "Skip MSstats for statistical post-processing?", + "description": "Skip MSstats/MSstatsTMT for statistical post-processing?", "fa_icon": "fas fa-forward" }, "ref_condition": { @@ -881,6 +881,76 @@ "description": "Also create an output in Triqler's format for an alternative manual post-processing with that tool", "default": false, "fa_icon": "far fa-check-square" + }, + "msstatslfq_feature_subset_protein": { + "type": "string", + "description": "Which features to use for quantification per protein: 'top3' or 'highQuality' which removes outliers only", + "fa_icon": "far fa-list-ol", + "default": "top3", + "enum": ["top3", "highQuality"] + }, + "msstatslfq_quant_summary_method": { + "type": "string", + "description": "which summary method to use: 'TMP' (Tukey's median polish) or 'linear' (linear mixed model)", + "fa_icon": "far fa-list-ol", + "default": "TMP", + "enum": ["TMP", "linear"] + }, + "msstats_remove_one_feat_prot": { + "type": "boolean", + "description": "Omit proteins with only one quantified feature?", + "fa_icon": "far fa-check-square", + "default": true + }, + "msstatslfq_removeFewMeasurements": { + "type": "boolean", + "description": "Keep features with only one or two measurements across runs?", + "fa_icon": "far fa-check-square", + "default": true + }, + "msstatsiso_useunique_peptide": { + "type": "boolean", + "description": "Use unique peptide for each protein", + "fa_icon": "far fa-check-square", + "default": true + }, + "msstatsiso_rmpsm_withfewmea_withinrun": { + "type": "boolean", + "description": "Remove the features that have 1 or 2 measurements within each run", + "fa_icon": "far fa-check-square", + "default": true + }, + "msstatsiso_summaryformultiple_psm": { + "type": "string", + "description": "select the feature with the largest summmation or maximal value", + "fa_icon": "far fa-list-ol", + "default": "sum", + "enum": ["sum", "max"] + }, + "msstatsiso_summarization_method": { + "type": "string", + "description": "summarization methods to protein-level can be perfomed", + "fa_icon": "far fa-list-ol", + "default": "msstats", + "enum": ["msstats", "MedianPolish", "Median", "LogSum"] + }, + "msstatsiso_global_norm": { + "type": "boolean", + "description": "Reference channel based normalization between MS runs on protein level data?", + "fa_icon": "far fa-check-square", + "default": true + }, + "msstatsiso_remove_norm_channel": { + "type": "boolean", + "description": "Remove 'Norm' channels from protein level data", + "fa_icon": "far fa-check-square", + "default": true + }, + "msstatsiso_reference_normalization": { + "type": "boolean", + "description": "Reference channel based normalization between MS runs on protein level data", + "fa_icon": "far fa-check-square", + "default": true } }, "fa_icon": "fab fa-r-project" diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index d044d9fd..101259b4 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -11,13 +11,18 @@ workflow INPUT_CHECK { main: if (input_file.toString().toLowerCase().contains("sdrf")) { is_sdrf = true - } else{ + } else { is_sdrf = false + if (!params.labelling_type || !params.acquisition_method) + { + log.error "If no SDRF was given, specifying --labelling_type and --acquisition_method is mandatory." + exit 1 + } } SAMPLESHEET_CHECK ( input_file, is_sdrf ) emit: ch_input_file = SAMPLESHEET_CHECK.out.checked_file is_sdrf = is_sdrf - versions = SAMPLESHEET_CHECK.out.versions + versions = SAMPLESHEET_CHECK.out.versions } diff --git a/workflows/dia.nf b/workflows/dia.nf index 14497c8e..c5fa3a53 100644 --- a/workflows/dia.nf +++ b/workflows/dia.nf @@ -11,6 +11,7 @@ include { DIANNSEARCH } from '../modules/local/diannsearch/main' include { GENERATE_DIANN_CFG as DIANNCFG } from '../modules/local/generate_diann_cfg/main' include { DIANNCONVERT } from '../modules/local/diannconvert/main' include { LIBRARYGENERATION } from '../modules/local/librarygeneration/main' +include { MSSTATS } from '../modules/local/msstats/main' // // SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules @@ -53,11 +54,22 @@ workflow DIA { DIANNCONVERT(DIANNSEARCH.out.report, ch_expdesign) versions = ch_software_versions + // + // MODULE: MSSTATS + ch_msstats_out = Channel.empty() + if(!params.skip_post_msstats){ + MSSTATS(DIANNCONVERT.out.out_msstats) + ch_msstats_out = MSSTATS.out.msstats_csv + ch_software_versions = ch_software_versions.mix(MSSTATS.out.version.ifEmpty(null)) + } + emit: versions = versions diann_report = DIANNSEARCH.out.report - out_msstats = DIANNCONVERT.out.out_msstats + msstats_csv = DIANNCONVERT.out.out_msstats out_triqler = DIANNCONVERT.out.out_triqler + msstats_out = ch_msstats_out + } /* diff --git a/workflows/lfq.nf b/workflows/lfq.nf index 3f558ec1..603b4b70 100644 --- a/workflows/lfq.nf +++ b/workflows/lfq.nf @@ -55,9 +55,12 @@ workflow LFQ { ) ch_software_versions = ch_software_versions.mix(PROTEOMICSLFQ.out.version.ifEmpty(null)) + // + // MODULE: MSSTATS + // ch_msstats_out = Channel.empty() if(!params.skip_post_msstats && params.quantification_method == "feature_intensity"){ - MSSTATS(PROTEOMICSLFQ.out.out_msstats, PROTEOMICSLFQ.out.out_mztab) + MSSTATS(PROTEOMICSLFQ.out.out_msstats) ch_msstats_out = MSSTATS.out.msstats_csv ch_software_versions = ch_software_versions.mix(MSSTATS.out.version.ifEmpty(null)) } diff --git a/workflows/tmt.nf b/workflows/tmt.nf index 12676d18..64a57c93 100644 --- a/workflows/tmt.nf +++ b/workflows/tmt.nf @@ -8,6 +8,7 @@ // MODULES: Local to the pipeline // include { FILEMERGE } from '../modules/local/openms/filemerge/main' +include { MSSTATSTMT } from '../modules/local/msstatstmt/main' // // SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules @@ -62,6 +63,17 @@ workflow TMT { PROTEINQUANT(PROTEININFERENCE.out.epi_idfilter, ch_expdesign) ch_software_versions = ch_software_versions.mix(PROTEINQUANT.out.version.ifEmpty(null)) + // + // MODULE: MSSTATSTMT + // + ch_msstats_out = Channel.empty() + if(!params.skip_post_msstats){ + MSSTATSTMT(PROTEINQUANT.out.msstats_csv) + ch_msstats_out = MSSTATSTMT.out.msstats_csv + ch_software_versions = ch_software_versions.mix(MSSTATSTMT.out.version.ifEmpty(null)) + } + + ID.out.psmrescoring_results .map { it -> it[1] } .set { ch_pmultiqc_ids } @@ -69,5 +81,7 @@ workflow TMT { emit: ch_pmultiqc_ids = ch_pmultiqc_ids final_result = PROTEINQUANT.out.out_mztab + msstats_input = PROTEINQUANT.out.msstats_csv + msstats_out = ch_msstats_out versions = ch_software_versions }