From 11e8efd9ef11348169acc9ffb19893eff8485930 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 11:44:43 +0100 Subject: [PATCH 01/47] PXD000001.sdrf.tsv removed from assets --- assets/PXD000001.sdrf.tsv | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 assets/PXD000001.sdrf.tsv diff --git a/assets/PXD000001.sdrf.tsv b/assets/PXD000001.sdrf.tsv deleted file mode 100644 index f9ca1631..00000000 --- a/assets/PXD000001.sdrf.tsv +++ /dev/null @@ -1,7 +0,0 @@ -Source Name Characteristics[organism] Characteristics[organism part] Characteristics[age] Characteristics[ancestry category] Characteristics[developmental stage] Characteristics[cell line] Characteristics[cell type] Characteristics[sex] Characteristics[mass] Characteristics[spiked compound] Characteristics[spiked compound 2] Characteristics[spiked compound 3] Characteristics[spiked compound 4] Characteristics[disease] Characteristics[biological replicate] Material Type assay name technology type comment[data file] comment[file uri] comment[technical replicate] comment[fraction identifier] comment[label] comment[instrument] comment[modification parameters] comment[modification parameters] comment[modification parameters] comment[modification parameters] comment[cleavage agent details] comment[dissociation method] comment[precursor mass tolerance] comment[fragment mass tolerance] Factor Value[spiked compound] Factor Value[spiked compound] Factor Value[spiked compound] Factor Value[spiked compound] -Sample 1 Dickeya chrysanthemi whole plant not available not available not available not applicable not available not applicable 1 SP=Yeast;CT=protein;AC=P00924;QY=10 SP=BOVIN;CT=protein;AC=P02769;QY=1 SP=RABIT;CT=protein;AC=P00489;QY=2 SP=BOVIN;CT=protein;AC=P62894;QY=1 not available 1 cell run 1 proteomic profiling by mass spectrometry TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw ftp://ftp.pride.ebi.ac.uk/pride-archive/2012/03/PXD000001/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw 1 1 TMT126 NT=LTQ Orbitrap Velos;AC=MS:1001742 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Methylthio;TA=C;MT=fixed;AC=UNIMOD:39 NT=TMT6plex;TA=K;MT=Fixed;AC=UNIMOD:737 NT=TMT6plex;PP=Any N-term;MT=Fixed;AC=UNIMOD:737 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 not available not available SP=Yeast;CT=protein;AC=P00924;QY=10 SP=BOVIN;CT=protein;AC=P02769;QY=1 SP=RABIT;CT=protein;AC=P00489;QY=2 SP=BOVIN;CT=protein;AC=P62894;QY=1 -Sample 2 Dickeya chrysanthemi whole plant not available not available not available not applicable not available not applicable 1 SP=Yeast;CT=protein;AC=P00924;QY=5 SP=BOVIN;CT=protein;AC=P02769;QY=2.5 SP=RABIT;CT=protein;AC=P00489;QY=2 SP=BOVIN;CT=protein;AC=P62894;QY=1 not available 1 cell run 1 proteomic profiling by mass spectrometry TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw ftp://ftp.pride.ebi.ac.uk/pride-archive/2012/03/PXD000001/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw 1 1 TMT127 NT=LTQ Orbitrap Velos;AC=MS:1001742 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Methylthio;TA=C;MT=fixed;AC=UNIMOD:39 NT=TMT6plex;TA=K;MT=Fixed;AC=UNIMOD:737 NT=TMT6plex;PP=Any N-term;MT=Fixed;AC=UNIMOD:737 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 not available not available SP=Yeast;CT=protein;AC=P00924;QY=5 SP=BOVIN;CT=protein;AC=P02769;QY=2.5 SP=RABIT;CT=protein;AC=P00489;QY=2 SP=BOVIN;CT=protein;AC=P62894;QY=1 -Sample 3 Dickeya chrysanthemi whole plant not available not available not available not applicable not available not applicable 1 SP=Yeast;CT=protein;AC=P00924;QY=2.5 SP=BOVIN;CT=protein;AC=P02769;QY=5 SP=RABIT;CT=protein;AC=P00489;QY=2 SP=BOVIN;CT=protein;AC=P62894;QY=1 not available 1 cell run 1 proteomic profiling by mass spectrometry TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw ftp://ftp.pride.ebi.ac.uk/pride-archive/2012/03/PXD000001/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw 1 1 TMT128 NT=LTQ Orbitrap Velos;AC=MS:1001742 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Methylthio;TA=C;MT=fixed;AC=UNIMOD:39 NT=TMT6plex;TA=K;MT=Fixed;AC=UNIMOD:737 NT=TMT6plex;PP=Any N-term;MT=Fixed;AC=UNIMOD:737 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 not available not available SP=Yeast;CT=protein;AC=P00924;QY=2.5 SP=BOVIN;CT=protein;AC=P02769;QY=5 SP=RABIT;CT=protein;AC=P00489;QY=2 SP=BOVIN;CT=protein;AC=P62894;QY=1 -Sample 4 Dickeya chrysanthemi whole plant not available not available not available not applicable not available not applicable 1 SP=Yeast;CT=protein;AC=P00924;QY=1 SP=BOVIN;CT=protein;AC=P02769;QY=10 SP=RABIT;CT=protein;AC=P00489;QY=2 SP=BOVIN;CT=protein;AC=P62894;QY=1 not available 1 cell run 1 proteomic profiling by mass spectrometry TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw ftp://ftp.pride.ebi.ac.uk/pride-archive/2012/03/PXD000001/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw 1 1 TMT129 NT=LTQ Orbitrap Velos;AC=MS:1001742 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Methylthio;TA=C;MT=fixed;AC=UNIMOD:39 NT=TMT6plex;TA=K;MT=Fixed;AC=UNIMOD:737 NT=TMT6plex;PP=Any N-term;MT=Fixed;AC=UNIMOD:737 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 not available not available SP=Yeast;CT=protein;AC=P00924;QY=1 SP=BOVIN;CT=protein;AC=P02769;QY=10 SP=RABIT;CT=protein;AC=P00489;QY=2 SP=BOVIN;CT=protein;AC=P62894;QY=1 -Sample 5 Dickeya chrysanthemi whole plant not available not available not available not applicable not available not applicable 1 SP=Yeast;CT=protein;AC=P00924;QY=2.5 SP=BOVIN;CT=protein;AC=P02769;QY=5 SP=RABIT;CT=protein;AC=P00489;QY=1 SP=BOVIN;CT=protein;AC=P62894;QY=1 not available 1 cell run 1 proteomic profiling by mass spectrometry TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw ftp://ftp.pride.ebi.ac.uk/pride-archive/2012/03/PXD000001/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw 1 1 TMT130 NT=LTQ Orbitrap Velos;AC=MS:1001742 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Methylthio;TA=C;MT=fixed;AC=UNIMOD:39 NT=TMT6plex;TA=K;MT=Fixed;AC=UNIMOD:737 NT=TMT6plex;PP=Any N-term;MT=Fixed;AC=UNIMOD:737 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 not available not available SP=Yeast;CT=protein;AC=P00924;QY=2.5 SP=BOVIN;CT=protein;AC=P02769;QY=5 SP=RABIT;CT=protein;AC=P00489;QY=1 SP=BOVIN;CT=protein;AC=P62894;QY=1 -Sample 6 Dickeya chrysanthemi whole plant not available not available not available not applicable not available not applicable 1 SP=Yeast;CT=protein;AC=P00924;QY=10 SP=BOVIN;CT=protein;AC=P02769;QY=1 SP=RABIT;CT=protein;AC=P00489;QY=1 SP=BOVIN;CT=protein;AC=P62894;QY=2 not available 1 cell run 1 proteomic profiling by mass spectrometry TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw ftp://ftp.pride.ebi.ac.uk/pride-archive/2012/03/PXD000001/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw 1 1 TMT131 NT=LTQ Orbitrap Velos;AC=MS:1001742 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Methylthio;TA=C;MT=fixed;AC=UNIMOD:39 NT=TMT6plex;TA=K;MT=Fixed;AC=UNIMOD:737 NT=TMT6plex;PP=Any N-term;MT=Fixed;AC=UNIMOD:737 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 not available not available SP=Yeast;CT=protein;AC=P00924;QY=10 SP=BOVIN;CT=protein;AC=P02769;QY=1 SP=RABIT;CT=protein;AC=P00489;QY=1 SP=BOVIN;CT=protein;AC=P62894;QY=2 From 3efdd072006faf03af7550c36f866487115ca2b2 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 12:14:07 +0100 Subject: [PATCH 02/47] tests of ci moved to test-datasets --- conf/test.config | 2 +- conf/test_dia.config | 4 ++-- conf/test_full.config | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/test.config b/conf/test.config index bb1d959d..5ddca0b8 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,7 +25,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/tmt_ci/PXD000001.sdrf.tsv' - database = 'https://raw.githubusercontent.com/daichengxin/proteomicstmt/dev/tmt_testdata/erwinia_carotovora.fasta' + database = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/tmt_ci/erwinia_carotovora.fasta' posterior_probabilities = "percolator" search_engines = "msgf" protein_level_fdr_cutoff = 0.01 diff --git a/conf/test_dia.config b/conf/test_dia.config index 8ddcc6de..bd5eac56 100644 --- a/conf/test_dia.config +++ b/conf/test_dia.config @@ -23,8 +23,8 @@ params { tracedir = "${params.outdir}/pipeline_info" // Input data - input = 'https://raw.githubusercontent.com/daichengxin/quantms/dev/assets/PXD026600.sdrf.tsv' - database = 'ftp://massive.ucsd.edu/MSV000087597/sequence/REF_EColi_K12_UPS1_combined.fasta' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/dia_ci/PXD026600.sdrf.tsv' + database = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/dia_ci/REF_EColi_K12_UPS1_combined.fasta' min_pr_mz = 350 max_pr_mz = 950 min_fr_mz = 500 diff --git a/conf/test_full.config b/conf/test_full.config index 1a2e1f1b..25c0998d 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -21,7 +21,7 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/tmt_ci/PXD000001.sdrf.tsv' quant_method = 'ISO' - database = 'https://raw.githubusercontent.com/daichengxin/proteomicstmt/dev/tmt_testdata/erwinia_carotovora.fasta' + database = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/tmt_ci/erwinia_carotovora.fasta' posterior_probabilities = "percolator" search_engines = "comet,msgf" protein_level_fdr_cutoff = 0.01 From 0e80c77c91500ed521c5192dd2b1a2581dfa42b3 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 12:18:55 +0100 Subject: [PATCH 03/47] PXD026600.sdrf.tsv removed --- assets/PXD026600.sdrf.tsv | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 assets/PXD026600.sdrf.tsv diff --git a/assets/PXD026600.sdrf.tsv b/assets/PXD026600.sdrf.tsv deleted file mode 100644 index ed709b07..00000000 --- a/assets/PXD026600.sdrf.tsv +++ /dev/null @@ -1,5 +0,0 @@ -Source Name Characteristics[organism] Characteristics[organism part] Characteristics[age] Characteristics[strain] Characteristics[developmental stage] Characteristics[cell line] Characteristics[cell type] Characteristics[sex] Characteristics[disease] characteristics[mass] characteristics[spiked compound] Characteristics[biological replicate] Material Type assay name technology type comment[data file] comment[file uri] comment[technical replicate] comment[fraction identifier] comment[proteomics data acquisition method] comment[label] comment[instrument] comment[modification parameters] comment[modification parameters] comment[cleavage agent details] comment[dissociation method] comment[collision energy] comment[precursor mass tolerance] comment[fragment mass tolerance] factor value[spiked compound] -Sample 1 Escherichia coli K-12 whole plant not available K12 not available not applicable not available not available not available 1 ug CT=Mixture;CN=UPS1;QY=0.1 fmol 1 whole organism run 1 proteomic profiling by mass spectrometry RD139_Narrow_UPS1_0_1fmol_inj1.raw ftp://massive.ucsd.edu/MSV000087597/raw/RD139_Raw_files_DIA_Narrow/RD139_Narrow_UPS1_0_1fmol_inj1.raw 1 1 NT=Data-Independent Acquisition;AC=NCIT:C161786 AC=MS:1002038;NT=label free sample NT=Orbitrap Fusion;AC=MS:1002416 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 35% CE 10 ppm 20 mmu CT=Mixture;CN=UPS1;QY=0.1 fmol -Sample 1 Escherichia coli K-12 whole plant not available K12 not available not applicable not available not available not available 1 ug CT=Mixture;CN=UPS1;QY=0.1 fmol 1 whole organism run 2 proteomic profiling by mass spectrometry RD139_Narrow_UPS1_0_1fmol_inj2.raw ftp://massive.ucsd.edu/MSV000087597/raw/RD139_Raw_files_DIA_Narrow/RD139_Narrow_UPS1_0_1fmol_inj2.raw 2 1 NT=Data-Independent Acquisition;AC=NCIT:C161786 AC=MS:1002038;NT=label free sample NT=Orbitrap Fusion;AC=MS:1002416 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 35% CE 10 ppm 20 mmu CT=Mixture;CN=UPS1;QY=0.1 fmol -Sample 2 Escherichia coli K-12 whole plant not available K12 not available not applicable not available not available not available 1 ug CT=Mixture;CN=UPS1;QY=0.25 fmol 1 whole organism run 3 proteomic profiling by mass spectrometry RD139_Narrow_UPS1_0_25fmol_inj1.raw ftp://massive.ucsd.edu/MSV000087597/raw/RD139_Raw_files_DIA_Narrow/RD139_Narrow_UPS1_0_25fmol_inj1.raw 1 1 NT=Data-Independent Acquisition;AC=NCIT:C161786 AC=MS:1002038;NT=label free sample NT=Orbitrap Fusion;AC=MS:1002416 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Carbamidomethyl;TA=C;MT=Fixed;AC=UNIMOD:4 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 35% CE 10 ppm 20 mmu CT=Mixture;CN=UPS1;QY=0.25 fmol -Sample 2 Escherichia coli K-12 whole plant not available K12 not available not applicable not available not available not available 1 ug CT=Mixture;CN=UPS1;QY=0.25 fmol 1 whole organism run 4 proteomic profiling by mass spectrometry RD139_Narrow_UPS1_0_25fmol_inj2.raw ftp://massive.ucsd.edu/MSV000087597/raw/RD139_Raw_files_DIA_Narrow/RD139_Narrow_UPS1_0_25fmol_inj2.raw 2 1 NT=Data-Independent Acquisition;AC=NCIT:C161786 AC=MS:1002038;NT=label free sample NT=Orbitrap Fusion;AC=MS:1002416 NT=Oxidation;MT=Variable;TA=M;AC=Unimod:35 NT=Carbamidomethyl;TA=C;MT=Fixed;AC=UNIMOD:4 AC=MS:1001313;NT=Trypsin NT=HCD;AC=PRIDE:0000590 35% CE 10 ppm 20 mmu CT=Mixture;CN=UPS1;QY=0.25 fmol From 060cd3b60248f72dbf9ca76dbfcb1c3a7279040a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 17:39:50 +0100 Subject: [PATCH 04/47] first iteration of utils msstats_utils.R --- bin/msstats_plfq.R | 33 +++++++++------------------------ bin/msstats_tmt.R | 27 ++++++++------------------- bin/msstats_utils.R | 29 +++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 43 deletions(-) create mode 100644 bin/msstats_utils.R diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index 7689e745..9bda48ca 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -1,25 +1,15 @@ #!/usr/bin/env Rscript -args = commandArgs(trailingOnly=TRUE) -char_to_boolean = c("true"=TRUE, "false"=FALSE) + +require(MSstats) +require(tibble) +require(data.table) +include(msstats_utils.R) + usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." +char_to_boolean = c("true"=TRUE, "false"=FALSE) + +args = initialize_msstats(usage = usage) -#TODO rewrite mzTab in next version -if (length(args)<1) { - print(usage) - stop("At least the first argument must be supplied (input csv).n", call.=FALSE) -} -if (length(args)<2) { - # contrasts - args[2] = "pairwise" -} -if (length(args)<3) { - # default control condition - args[3] = "" -} -if (length(args)<4) { - # removeOneFeatProts - args[4] = FALSE -} removeOneFeatProts = args[4] if(typeof(removeOneFeatProts) == 'character'){ removeOneFeatProts = char_to_boolean[removeOneFeatProts] @@ -51,11 +41,6 @@ csv_input <- args[1] contrast_str <- args[2] control_str <- args[3] -# load the MSstats library -require(MSstats) -require(tibble) -require(data.table) - # helper functions make_contrasts <- function(contrasts, levels) { diff --git a/bin/msstats_tmt.R b/bin/msstats_tmt.R index e6d578e7..fb15cc58 100755 --- a/bin/msstats_tmt.R +++ b/bin/msstats_tmt.R @@ -1,25 +1,14 @@ #!/usr/bin/env Rscript -args = commandArgs(trailingOnly=TRUE) -char_to_boolean = c("true"=TRUE, "false"=FALSE) + +require(MSstatsTMT) +include(msstats_utils.R) + usage <- "Rscript msstats_tmt.R input.csv [list of contrasts or 'pairwise'] [default control condition or '']... [normalization based reference channel]" -if (length(args)<1) { - print(usage) - stop("At least the first argument must be supplied (input csv).n", call.=FALSE) -} -if (length(args)<2) { - # contrasts - args[2] = "pairwise" -} -if (length(args)<3) { - # default control condition - args[3] = "" -} +char_to_boolean = c("true"=TRUE, "false"=FALSE) + +args = initialize_msstats(usage = usage) -if (length(args)<4) { - # removeOneFeatProts - args[4] = FALSE -} rmProtein_with1Feature = args[4] if(typeof(rmProtein_with1Feature) == 'character'){ rmProtein_with1Feature = char_to_boolean[rmProtein_with1Feature] @@ -38,6 +27,7 @@ if (length(args)<6) { # remove the features that have 1 or 2 measurements within each Run. args[6] = TRUE } + rmPSM_withfewMea_withinRun = args[6] if(typeof(rmPSM_withfewMea_withinRun) == 'character'){ rmPSM_withfewMea_withinRun = char_to_boolean[rmPSM_withfewMea_withinRun] @@ -85,7 +75,6 @@ csv_input <- args[1] contrast_str <- args[2] control_str <- args[3] -require(MSstatsTMT) # read dataframe into MSstatsTMT data <- read.csv(csv_input) quant <- OpenMStoMSstatsTMTFormat(data, useUniquePeptide=useUniquePeptide, rmPSM_withfewMea_withinRun=rmPSM_withfewMea_withinRun, diff --git a/bin/msstats_utils.R b/bin/msstats_utils.R new file mode 100644 index 00000000..030ab57f --- /dev/null +++ b/bin/msstats_utils.R @@ -0,0 +1,29 @@ + + +initialize_msstats <- function (usage){ + args = commandArgs(trailingOnly=TRUE) + if (length(args)<1) { + print(usage) + stop("At least the first argument must be supplied (input csv).n", call.=FALSE) + } + if (length(args)<2) { + args[2] = "pairwise" + } + + if (length(args)<3) { + # default control condition + args[3] = "" + } + + if (length(args)<4) { + # removeOneFeatProts + args[4] = FALSE + } + + return(args) + +} + + + + From 666ffdee880d58ea2af7e285c6aace2a92412dfd Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 17:43:24 +0100 Subject: [PATCH 05/47] first iteration of utils msstats_utils.R --- bin/msstats_utils.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/msstats_utils.R b/bin/msstats_utils.R index 030ab57f..7df4eaa1 100644 --- a/bin/msstats_utils.R +++ b/bin/msstats_utils.R @@ -7,7 +7,7 @@ initialize_msstats <- function (usage){ stop("At least the first argument must be supplied (input csv).n", call.=FALSE) } if (length(args)<2) { - args[2] = "pairwise" + args[2] = "pairwise" } if (length(args)<3) { @@ -17,11 +17,10 @@ initialize_msstats <- function (usage){ if (length(args)<4) { # removeOneFeatProts - args[4] = FALSE + args[4] = FALSE } return(args) - } From 6082af030c5e23d511f1b10e7306b6182cc40af3 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 17:44:36 +0100 Subject: [PATCH 06/47] first iteration of utils msstats_utils.R --- bin/msstats_utils.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bin/msstats_utils.R b/bin/msstats_utils.R index 7df4eaa1..487c482d 100644 --- a/bin/msstats_utils.R +++ b/bin/msstats_utils.R @@ -1,25 +1,24 @@ initialize_msstats <- function (usage){ - args = commandArgs(trailingOnly=TRUE) + args <- commandArgs(trailingOnly=TRUE) if (length(args)<1) { print(usage) stop("At least the first argument must be supplied (input csv).n", call.=FALSE) } if (length(args)<2) { - args[2] = "pairwise" + args[2] <- "pairwise" } if (length(args)<3) { # default control condition - args[3] = "" + args[3] <- "" } if (length(args)<4) { # removeOneFeatProts - args[4] = FALSE + args[4] <- FALSE } - return(args) } From d6324f14a6d8eb72e07b06c697d054a9f8391c3f Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 17:55:23 +0100 Subject: [PATCH 07/47] first iteration of utils msstats_utils.R --- bin/msstats_plfq.R | 2 +- bin/msstats_tmt.R | 2 +- bin/msstats_utils.R | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index 9bda48ca..eb9f36f5 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -3,7 +3,7 @@ require(MSstats) require(tibble) require(data.table) -include(msstats_utils.R) +source(msstats_utils.R) usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." char_to_boolean = c("true"=TRUE, "false"=FALSE) diff --git a/bin/msstats_tmt.R b/bin/msstats_tmt.R index fb15cc58..8dfc44a9 100755 --- a/bin/msstats_tmt.R +++ b/bin/msstats_tmt.R @@ -1,7 +1,7 @@ #!/usr/bin/env Rscript require(MSstatsTMT) -include(msstats_utils.R) +source(msstats_utils.R) usage <- "Rscript msstats_tmt.R input.csv [list of contrasts or 'pairwise'] [default control condition or '']... [normalization based reference channel]" diff --git a/bin/msstats_utils.R b/bin/msstats_utils.R index 487c482d..63d49438 100644 --- a/bin/msstats_utils.R +++ b/bin/msstats_utils.R @@ -1,4 +1,9 @@ +#' Inizialize the TMT and LFQ parameters +#' +#' @param usage message to exit the script analysis +#' +#' @return initialize_msstats <- function (usage){ args <- commandArgs(trailingOnly=TRUE) From ecf4d34536ad5d47bed79d4c702169b421e38f10 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 18:07:18 +0100 Subject: [PATCH 08/47] first iteration of utils msstats_utils.R --- bin/msstats_plfq.R | 2 +- bin/msstats_tmt.R | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index eb9f36f5..a438140a 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -3,7 +3,7 @@ require(MSstats) require(tibble) require(data.table) -source(msstats_utils.R) +source("msstats_utils.R") usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." char_to_boolean = c("true"=TRUE, "false"=FALSE) diff --git a/bin/msstats_tmt.R b/bin/msstats_tmt.R index 8dfc44a9..227d2811 100755 --- a/bin/msstats_tmt.R +++ b/bin/msstats_tmt.R @@ -1,7 +1,7 @@ #!/usr/bin/env Rscript require(MSstatsTMT) -source(msstats_utils.R) +source("msstats_utils.R") usage <- "Rscript msstats_tmt.R input.csv [list of contrasts or 'pairwise'] [default control condition or '']... [normalization based reference channel]" @@ -136,6 +136,7 @@ if (l == 1) contrast_lst <- unlist(strsplit(contrast_str,";")) contrast_mat <- make_contrasts(contrast_lst, lvls) } + print ("Contrasts to be tested:") print (contrast_mat) #TODO allow for user specified contrasts From fa6cf686d94df950cae7b1474a18ed02aedacfa5 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 18:29:23 +0100 Subject: [PATCH 09/47] first iteration of utils msstats_utils.R --- bin/msstats_plfq.R | 2 +- bin/msstats_tmt.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index a438140a..3aa11eb7 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -3,7 +3,7 @@ require(MSstats) require(tibble) require(data.table) -source("msstats_utils.R") +source("./msstats_utils.R") usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." char_to_boolean = c("true"=TRUE, "false"=FALSE) diff --git a/bin/msstats_tmt.R b/bin/msstats_tmt.R index 227d2811..fd05156f 100755 --- a/bin/msstats_tmt.R +++ b/bin/msstats_tmt.R @@ -1,7 +1,7 @@ #!/usr/bin/env Rscript require(MSstatsTMT) -source("msstats_utils.R") +source("./msstats_utils.R") usage <- "Rscript msstats_tmt.R input.csv [list of contrasts or 'pairwise'] [default control condition or '']... [normalization based reference channel]" From 39454f7550883d86637050654a1792af652e7c36 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 18:46:12 +0100 Subject: [PATCH 10/47] first iteration of utils msstats_utils.R --- bin/msstats_plfq.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index 3aa11eb7..f9929f6a 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -3,6 +3,8 @@ require(MSstats) require(tibble) require(data.table) + +setwd(dirname(getwd())) source("./msstats_utils.R") usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." From 5d265c79a7ccf6102030e810aa3dc5557e5ea9ef Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 19:27:28 +0100 Subject: [PATCH 11/47] first iteration of utils msstats_utils.R --- bin/msstats_plfq.R | 25 ++++++++++++------------- bin/msstats_tmt.R | 42 +++++++++++++++++++++--------------------- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index f9929f6a..513ae340 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -4,39 +4,38 @@ require(MSstats) require(tibble) require(data.table) -setwd(dirname(getwd())) -source("./msstats_utils.R") +source(here::here('msstats_utils.R')) usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." -char_to_boolean = c("true"=TRUE, "false"=FALSE) +char_to_boolean <- c("true"=TRUE, "false"=FALSE) -args = initialize_msstats(usage = usage) +args <- initialize_msstats(usage = usage) -removeOneFeatProts = args[4] +removeOneFeatProts <- args[4] if(typeof(removeOneFeatProts) == 'character'){ - removeOneFeatProts = char_to_boolean[removeOneFeatProts] + removeOneFeatProts <- char_to_boolean[removeOneFeatProts] } if (length(args)<5) { # keeps features with only one or two measurements across runs - args[5] = TRUE + args[5] <- TRUE } -removeFewMeasurements = args[5] +removeFewMeasurements <- args[5] if(typeof(removeFewMeasurements) == 'character'){ - removeFewMeasurements = char_to_boolean[removeFewMeasurements] + removeFewMeasurements <- char_to_boolean[removeFewMeasurements] } if (length(args)<6) { # which features to use for quantification per protein: 'top3' or 'highQuality' which removes outliers only" - args[6] = 'top3' + args[6] <- 'top3' } if (length(args)<7) { # which summary method to use: 'TMP' (Tukey's median polish) or 'linear' (linear mixed model) - args[7] = 'TMP' + args[7] <- 'TMP' } if (length(args)<8) { # outputPrefix - args[8] = './msstats' + args[8] <- './msstats' } csv_input <- args[1] @@ -207,7 +206,7 @@ if (l == 1) groupComparisonPlots(data=test.MSstats$ComparisonResult, type="ComparisonPlot", width=12, height=12,dot.size = 2) - test.MSstats$Volcano = test.MSstats$ComparisonResult[!is.na(test.MSstats$ComparisonResult$pvalue),] + test.MSstats$Volcano <- test.MSstats$ComparisonResult[!is.na(test.MSstats$ComparisonResult$pvalue),] groupComparisonPlots(data=test.MSstats$Volcano, type="VolcanoPlot", width=12, height=12,dot.size = 2) diff --git a/bin/msstats_tmt.R b/bin/msstats_tmt.R index fd05156f..80c3c50e 100755 --- a/bin/msstats_tmt.R +++ b/bin/msstats_tmt.R @@ -5,70 +5,70 @@ source("./msstats_utils.R") usage <- "Rscript msstats_tmt.R input.csv [list of contrasts or 'pairwise'] [default control condition or '']... [normalization based reference channel]" -char_to_boolean = c("true"=TRUE, "false"=FALSE) +char_to_boolean <- c("true"=TRUE, "false"=FALSE) -args = initialize_msstats(usage = usage) +args <- initialize_msstats(usage = usage) -rmProtein_with1Feature = args[4] +rmProtein_with1Feature <- args[4] if(typeof(rmProtein_with1Feature) == 'character'){ - rmProtein_with1Feature = char_to_boolean[rmProtein_with1Feature] + rmProtein_with1Feature <- char_to_boolean[rmProtein_with1Feature] } if (length(args)<5) { # use unique peptide - args[5] = TRUE + args[5] <- TRUE } -useUniquePeptide = args[5] +useUniquePeptide <- args[5] if(typeof(useUniquePeptide) == 'character'){ - useUniquePeptide = char_to_boolean[useUniquePeptide] + useUniquePeptide <- char_to_boolean[useUniquePeptide] } if (length(args)<6) { # remove the features that have 1 or 2 measurements within each Run. - args[6] = TRUE + args[6] <- TRUE } -rmPSM_withfewMea_withinRun = args[6] +rmPSM_withfewMea_withinRun <- args[6] if(typeof(rmPSM_withfewMea_withinRun) == 'character'){ - rmPSM_withfewMea_withinRun = char_to_boolean[rmPSM_withfewMea_withinRun] + rmPSM_withfewMea_withinRun <- char_to_boolean[rmPSM_withfewMea_withinRun] } if (length(args)<7) { # sum or max - when there are multiple measurements for certain feature in certain Run. - args[7] = 'sum' + args[7] <- 'sum' } if (length(args)<8) { # summarization methods to protein-level can be performed: "msstats(default)" - args[8] = "msstats" + args[8] <- "msstats" } if (length(args)<9) { # Global median normalization on peptide level data - args[9] = TRUE + args[9] <- TRUE } -global_norm = args[9] +global_norm <- args[9] if(typeof(global_norm) == 'character'){ - global_norm = char_to_boolean[global_norm] + global_norm <- char_to_boolean[global_norm] } if (length(args)<10) { # Remove norm channel - args[10] = TRUE + args[10] <- TRUE } -remove_norm_channel = args[10] +remove_norm_channel <- args[10] if(typeof(remove_norm_channel) == 'character'){ - remove_norm_channel = char_to_boolean[remove_norm_channel] + remove_norm_channel <- char_to_boolean[remove_norm_channel] } if (length(args)<11) { # default Reference channel based normalization between MS runs on protein level data. # Reference Channel annotated by 'Norm' in Condition. - args[11] = TRUE + args[11] <- TRUE } -reference_norm = args[11] +reference_norm <- args[11] if(typeof(reference_norm) == 'character'){ - reference_norm = char_to_boolean[reference_norm] + reference_norm <- char_to_boolean[reference_norm] } csv_input <- args[1] From 7ed58f13c987c7d9786a1791a464130a0590e10d Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 19:38:46 +0100 Subject: [PATCH 12/47] first iteration of utils msstats_utils.R --- bin/msstats_plfq.R | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index 513ae340..310d4c60 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -3,6 +3,7 @@ require(MSstats) require(tibble) require(data.table) +require(here) source(here::here('msstats_utils.R')) From 606c041f803695caf8009aeeb75ba3f47a93737c Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 19:50:49 +0100 Subject: [PATCH 13/47] first iteration of utils msstats_utils.R --- modules/local/msstats/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/msstats/main.nf b/modules/local/msstats/main.nf index 905012ce..4c4aa9ba 100644 --- a/modules/local/msstats/main.nf +++ b/modules/local/msstats/main.nf @@ -1,7 +1,7 @@ process MSSTATS { label 'process_medium' - conda (params.enable_conda ? "bioconda::bioconductor-msstats=4.2.0" : null) + conda (params.enable_conda ? "bioconda::bioconductor-msstats=4.2.0 conda-forge::r-here" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/bioconductor-msstats:4.2.0--r41h619a076_1" } else { From dfd691921a364fa24181fa279489df5ed34e60ca Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 20:02:28 +0100 Subject: [PATCH 14/47] first iteration of utils msstats_utils.R --- bin/msstats_plfq.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index 310d4c60..35bf6f45 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -5,7 +5,7 @@ require(tibble) require(data.table) require(here) -source(here::here('msstats_utils.R')) +source(here::here('bin/msstats_utils.R')) usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." char_to_boolean <- c("true"=TRUE, "false"=FALSE) From bfa03577a4a88d0fcb1f4e7e0b06e8e2ca2a95cc Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 20:10:03 +0100 Subject: [PATCH 15/47] first iteration of utils msstats_utils.R --- bin/msstats_tmt.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/msstats_tmt.R b/bin/msstats_tmt.R index 80c3c50e..34015118 100755 --- a/bin/msstats_tmt.R +++ b/bin/msstats_tmt.R @@ -1,7 +1,7 @@ #!/usr/bin/env Rscript require(MSstatsTMT) -source("./msstats_utils.R") +source("./bin/msstats_utils.R") usage <- "Rscript msstats_tmt.R input.csv [list of contrasts or 'pairwise'] [default control condition or '']... [normalization based reference channel]" From 482573655d8d8b3a313bc93d9503f5ce83e5f40c Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 20:21:50 +0100 Subject: [PATCH 16/47] first iteration of utils msstats_utils.R --- .github/workflows/ci.yml | 2 +- conf/{test_full.config => test_tmt.config} | 0 nextflow.config | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename conf/{test_full.config => test_tmt.config} (100%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b3393fea..8fe11b9f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: # Test latest edge release of Nextflow - NXF_VER: "" NXF_EDGE: "1" - test_profile: ["test", "test_lfq", "test_dia", "test_localize"] + test_profile: ["test", "test_lfq", "test_dia", "test_localize", "test_tmt"] exec_profile: ["docker", "conda"] exclude: - test_profile: test_dia diff --git a/conf/test_full.config b/conf/test_tmt.config similarity index 100% rename from conf/test_full.config rename to conf/test_tmt.config diff --git a/nextflow.config b/nextflow.config index 4eb6c34d..2aa9923e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -283,7 +283,7 @@ profiles { } test { includeConfig 'conf/test.config' } test_localize { includeConfig 'conf/test_localize.config'} - test_full { includeConfig 'conf/test_full.config' } + test_tmt { includeConfig 'conf/test_tmt.config' } test_lfq { includeConfig 'conf/test_lfq.config' } test_dia { includeConfig 'conf/test_dia.config' } mambaci { includeConfig 'conf/mambaci.config' } From 154830f6d7e7d4b67077ed1adc94b09baf46f8b0 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 30 Apr 2022 08:32:00 +0100 Subject: [PATCH 17/47] first iteration of utils msstats_utils.R --- bin/msstats_plfq.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index 35bf6f45..4609d43a 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -3,9 +3,9 @@ require(MSstats) require(tibble) require(data.table) -require(here) +# require(here) -source(here::here('bin/msstats_utils.R')) +source('bin/msstats_utils.R') usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." char_to_boolean <- c("true"=TRUE, "false"=FALSE) From 13784fa0b39c06d020429f0c418715ddbb97da90 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 30 Apr 2022 08:54:20 +0100 Subject: [PATCH 18/47] rollback changes --- bin/msstats_plfq.R | 53 +++++++++++++++++++++++-------------- bin/msstats_tmt.R | 66 ++++++++++++++++++++++++++-------------------- 2 files changed, 71 insertions(+), 48 deletions(-) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index 4609d43a..7689e745 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -1,48 +1,61 @@ #!/usr/bin/env Rscript - -require(MSstats) -require(tibble) -require(data.table) -# require(here) - -source('bin/msstats_utils.R') - +args = commandArgs(trailingOnly=TRUE) +char_to_boolean = c("true"=TRUE, "false"=FALSE) usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." -char_to_boolean <- c("true"=TRUE, "false"=FALSE) -args <- initialize_msstats(usage = usage) - -removeOneFeatProts <- args[4] +#TODO rewrite mzTab in next version +if (length(args)<1) { + print(usage) + stop("At least the first argument must be supplied (input csv).n", call.=FALSE) +} +if (length(args)<2) { + # contrasts + args[2] = "pairwise" +} +if (length(args)<3) { + # default control condition + args[3] = "" +} +if (length(args)<4) { + # removeOneFeatProts + args[4] = FALSE +} +removeOneFeatProts = args[4] if(typeof(removeOneFeatProts) == 'character'){ - removeOneFeatProts <- char_to_boolean[removeOneFeatProts] + removeOneFeatProts = char_to_boolean[removeOneFeatProts] } if (length(args)<5) { # keeps features with only one or two measurements across runs - args[5] <- TRUE + args[5] = TRUE } -removeFewMeasurements <- args[5] +removeFewMeasurements = args[5] if(typeof(removeFewMeasurements) == 'character'){ - removeFewMeasurements <- char_to_boolean[removeFewMeasurements] + removeFewMeasurements = char_to_boolean[removeFewMeasurements] } if (length(args)<6) { # which features to use for quantification per protein: 'top3' or 'highQuality' which removes outliers only" - args[6] <- 'top3' + args[6] = 'top3' } if (length(args)<7) { # which summary method to use: 'TMP' (Tukey's median polish) or 'linear' (linear mixed model) - args[7] <- 'TMP' + args[7] = 'TMP' } if (length(args)<8) { # outputPrefix - args[8] <- './msstats' + args[8] = './msstats' } csv_input <- args[1] contrast_str <- args[2] control_str <- args[3] +# load the MSstats library +require(MSstats) +require(tibble) +require(data.table) + # helper functions make_contrasts <- function(contrasts, levels) { @@ -207,7 +220,7 @@ if (l == 1) groupComparisonPlots(data=test.MSstats$ComparisonResult, type="ComparisonPlot", width=12, height=12,dot.size = 2) - test.MSstats$Volcano <- test.MSstats$ComparisonResult[!is.na(test.MSstats$ComparisonResult$pvalue),] + test.MSstats$Volcano = test.MSstats$ComparisonResult[!is.na(test.MSstats$ComparisonResult$pvalue),] groupComparisonPlots(data=test.MSstats$Volcano, type="VolcanoPlot", width=12, height=12,dot.size = 2) diff --git a/bin/msstats_tmt.R b/bin/msstats_tmt.R index 34015118..e6d578e7 100755 --- a/bin/msstats_tmt.R +++ b/bin/msstats_tmt.R @@ -1,80 +1,91 @@ #!/usr/bin/env Rscript - -require(MSstatsTMT) -source("./bin/msstats_utils.R") - +args = commandArgs(trailingOnly=TRUE) +char_to_boolean = c("true"=TRUE, "false"=FALSE) usage <- "Rscript msstats_tmt.R input.csv [list of contrasts or 'pairwise'] [default control condition or '']... [normalization based reference channel]" -char_to_boolean <- c("true"=TRUE, "false"=FALSE) - -args <- initialize_msstats(usage = usage) +if (length(args)<1) { + print(usage) + stop("At least the first argument must be supplied (input csv).n", call.=FALSE) +} +if (length(args)<2) { + # contrasts + args[2] = "pairwise" +} +if (length(args)<3) { + # default control condition + args[3] = "" +} -rmProtein_with1Feature <- args[4] +if (length(args)<4) { + # removeOneFeatProts + args[4] = FALSE +} +rmProtein_with1Feature = args[4] if(typeof(rmProtein_with1Feature) == 'character'){ - rmProtein_with1Feature <- char_to_boolean[rmProtein_with1Feature] + rmProtein_with1Feature = char_to_boolean[rmProtein_with1Feature] } if (length(args)<5) { # use unique peptide - args[5] <- TRUE + args[5] = TRUE } -useUniquePeptide <- args[5] +useUniquePeptide = args[5] if(typeof(useUniquePeptide) == 'character'){ - useUniquePeptide <- char_to_boolean[useUniquePeptide] + useUniquePeptide = char_to_boolean[useUniquePeptide] } if (length(args)<6) { # remove the features that have 1 or 2 measurements within each Run. - args[6] <- TRUE + args[6] = TRUE } - -rmPSM_withfewMea_withinRun <- args[6] +rmPSM_withfewMea_withinRun = args[6] if(typeof(rmPSM_withfewMea_withinRun) == 'character'){ - rmPSM_withfewMea_withinRun <- char_to_boolean[rmPSM_withfewMea_withinRun] + rmPSM_withfewMea_withinRun = char_to_boolean[rmPSM_withfewMea_withinRun] } if (length(args)<7) { # sum or max - when there are multiple measurements for certain feature in certain Run. - args[7] <- 'sum' + args[7] = 'sum' } if (length(args)<8) { # summarization methods to protein-level can be performed: "msstats(default)" - args[8] <- "msstats" + args[8] = "msstats" } if (length(args)<9) { # Global median normalization on peptide level data - args[9] <- TRUE + args[9] = TRUE } -global_norm <- args[9] +global_norm = args[9] if(typeof(global_norm) == 'character'){ - global_norm <- char_to_boolean[global_norm] + global_norm = char_to_boolean[global_norm] } if (length(args)<10) { # Remove norm channel - args[10] <- TRUE + args[10] = TRUE } -remove_norm_channel <- args[10] +remove_norm_channel = args[10] if(typeof(remove_norm_channel) == 'character'){ - remove_norm_channel <- char_to_boolean[remove_norm_channel] + remove_norm_channel = char_to_boolean[remove_norm_channel] } if (length(args)<11) { # default Reference channel based normalization between MS runs on protein level data. # Reference Channel annotated by 'Norm' in Condition. - args[11] <- TRUE + args[11] = TRUE } -reference_norm <- args[11] +reference_norm = args[11] if(typeof(reference_norm) == 'character'){ - reference_norm <- char_to_boolean[reference_norm] + reference_norm = char_to_boolean[reference_norm] } csv_input <- args[1] contrast_str <- args[2] control_str <- args[3] +require(MSstatsTMT) # read dataframe into MSstatsTMT data <- read.csv(csv_input) quant <- OpenMStoMSstatsTMTFormat(data, useUniquePeptide=useUniquePeptide, rmPSM_withfewMea_withinRun=rmPSM_withfewMea_withinRun, @@ -136,7 +147,6 @@ if (l == 1) contrast_lst <- unlist(strsplit(contrast_str,";")) contrast_mat <- make_contrasts(contrast_lst, lvls) } - print ("Contrasts to be tested:") print (contrast_mat) #TODO allow for user specified contrasts From d94244c52b961dc427425ad7fb70012207b6e47c Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 30 Apr 2022 09:06:53 +0100 Subject: [PATCH 19/47] fixing tmt --- conf/test_tmt.config | 4 ++++ nextflow.config | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/conf/test_tmt.config b/conf/test_tmt.config index 25c0998d..5fa1bae3 100644 --- a/conf/test_tmt.config +++ b/conf/test_tmt.config @@ -17,6 +17,10 @@ params { outdir = "./results_iso_full" tracedir = "${params.outdir}/pipeline_info" + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + // Input data for full size test input = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/tmt_ci/PXD000001.sdrf.tsv' diff --git a/nextflow.config b/nextflow.config index 2aa9923e..18603cd2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -283,7 +283,7 @@ profiles { } test { includeConfig 'conf/test.config' } test_localize { includeConfig 'conf/test_localize.config'} - test_tmt { includeConfig 'conf/test_tmt.config' } + test_tmt { includeConfig 'conf/test_tmt.config' } test_lfq { includeConfig 'conf/test_lfq.config' } test_dia { includeConfig 'conf/test_dia.config' } mambaci { includeConfig 'conf/mambaci.config' } From a19af0f41b552ff0c27cea648c64e0042fa3ac84 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 30 Apr 2022 09:13:33 +0100 Subject: [PATCH 20/47] adding ci test full --- conf/test_full.config | 36 ++++++++++++++++++++++++++++++++++++ nextflow.config | 13 +++++++------ 2 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 conf/test_full.config diff --git a/conf/test_full.config b/conf/test_full.config new file mode 100644 index 00000000..4e4e61f0 --- /dev/null +++ b/conf/test_full.config @@ -0,0 +1,36 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests (LFQ) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple test. + + Use as follows: + nextflow run nf-core/quantms -profile test_lfq, [--outdir ] + +------------------------------------------------------------------------------------------------ +*/ + +params { + config_profile_name = 'Test profile for DDA LFQ' + config_profile_description = 'Minimal test dataset to check pipeline function of the label-free quantification branch of the pipeline' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + outdir = "./results_lfq" + tracedir = "${params.outdir}/pipeline_info" + + // Input data + labelling_type = "label free sample" + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/lfq_ci/BSA/BSA_design_urls.tsv' + database = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/lfq_ci/BSA/18Protein_SoCe_Tr_detergents_trace_target_decoy.fasta' + posterior_probabilities = "fit_distributions" + search_engines = "msgf" + decoy_string= "rev" + enable_qc = true + add_triqler_output = true + protein_level_fdr_cutoff = 1.0 + acquisition_method = "dda" +} diff --git a/nextflow.config b/nextflow.config index 18603cd2..2ff9f646 100644 --- a/nextflow.config +++ b/nextflow.config @@ -281,12 +281,13 @@ profiles { podman.enabled = false shifter.enabled = false } - test { includeConfig 'conf/test.config' } - test_localize { includeConfig 'conf/test_localize.config'} - test_tmt { includeConfig 'conf/test_tmt.config' } - test_lfq { includeConfig 'conf/test_lfq.config' } - test_dia { includeConfig 'conf/test_dia.config' } - mambaci { includeConfig 'conf/mambaci.config' } + test { includeConfig 'conf/test.config' } + test_localize { includeConfig 'conf/test_localize.config' } + test_tmt { includeConfig 'conf/test_tmt.config' } + test_lfq { includeConfig 'conf/test_lfq.config' } + test_dia { includeConfig 'conf/test_dia.config' } + mambaci { includeConfig 'conf/mambaci.config' } + test_full { includeConfig 'conf/test_full.config' } } // Load module config after profile, so they can depend on overwritten input parameters specific for each profile. From 00e9108be014521273040ab88b856a828935f1a9 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 30 Apr 2022 09:35:15 +0100 Subject: [PATCH 21/47] clean R --- bin/msstats_plfq.R | 26 ++++++++++++------------ bin/msstats_tmt.R | 49 +++++++++++++++++++++++----------------------- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index 7689e745..370f8eaf 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -1,6 +1,6 @@ #!/usr/bin/env Rscript -args = commandArgs(trailingOnly=TRUE) -char_to_boolean = c("true"=TRUE, "false"=FALSE) +args <- commandArgs(trailingOnly=TRUE) +char_to_boolean <- c("true"=TRUE, "false"=FALSE) usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." #TODO rewrite mzTab in next version @@ -10,41 +10,41 @@ if (length(args)<1) { } if (length(args)<2) { # contrasts - args[2] = "pairwise" + args[2] <- "pairwise" } if (length(args)<3) { # default control condition - args[3] = "" + args[3] <- "" } if (length(args)<4) { # removeOneFeatProts - args[4] = FALSE + args[4] <- FALSE } -removeOneFeatProts = args[4] +removeOneFeatProts <- args[4] if(typeof(removeOneFeatProts) == 'character'){ - removeOneFeatProts = char_to_boolean[removeOneFeatProts] + removeOneFeatProts <- char_to_boolean[removeOneFeatProts] } if (length(args)<5) { # keeps features with only one or two measurements across runs - args[5] = TRUE + args[5] <- TRUE } -removeFewMeasurements = args[5] +removeFewMeasurements <- args[5] if(typeof(removeFewMeasurements) == 'character'){ - removeFewMeasurements = char_to_boolean[removeFewMeasurements] + removeFewMeasurements <- char_to_boolean[removeFewMeasurements] } if (length(args)<6) { # which features to use for quantification per protein: 'top3' or 'highQuality' which removes outliers only" - args[6] = 'top3' + args[6] <- 'top3' } if (length(args)<7) { # which summary method to use: 'TMP' (Tukey's median polish) or 'linear' (linear mixed model) - args[7] = 'TMP' + args[7] <- 'TMP' } if (length(args)<8) { # outputPrefix - args[8] = './msstats' + args[8] <- './msstats' } csv_input <- args[1] diff --git a/bin/msstats_tmt.R b/bin/msstats_tmt.R index e6d578e7..8e8994ab 100755 --- a/bin/msstats_tmt.R +++ b/bin/msstats_tmt.R @@ -1,6 +1,6 @@ #!/usr/bin/env Rscript -args = commandArgs(trailingOnly=TRUE) -char_to_boolean = c("true"=TRUE, "false"=FALSE) +args <- commandArgs(trailingOnly=TRUE) +char_to_boolean <- c("true"=TRUE, "false"=FALSE) usage <- "Rscript msstats_tmt.R input.csv [list of contrasts or 'pairwise'] [default control condition or '']... [normalization based reference channel]" if (length(args)<1) { @@ -9,76 +9,76 @@ if (length(args)<1) { } if (length(args)<2) { # contrasts - args[2] = "pairwise" + args[2] <- "pairwise" } if (length(args)<3) { # default control condition - args[3] = "" + args[3] <- "" } if (length(args)<4) { # removeOneFeatProts - args[4] = FALSE + args[4] <- FALSE } -rmProtein_with1Feature = args[4] +rmProtein_with1Feature <- args[4] if(typeof(rmProtein_with1Feature) == 'character'){ - rmProtein_with1Feature = char_to_boolean[rmProtein_with1Feature] + rmProtein_with1Feature <- char_to_boolean[rmProtein_with1Feature] } if (length(args)<5) { # use unique peptide - args[5] = TRUE + args[5] <- TRUE } -useUniquePeptide = args[5] +useUniquePeptide <- args[5] if(typeof(useUniquePeptide) == 'character'){ - useUniquePeptide = char_to_boolean[useUniquePeptide] + useUniquePeptide <- char_to_boolean[useUniquePeptide] } if (length(args)<6) { # remove the features that have 1 or 2 measurements within each Run. - args[6] = TRUE + args[6] <- TRUE } -rmPSM_withfewMea_withinRun = args[6] +rmPSM_withfewMea_withinRun <- args[6] if(typeof(rmPSM_withfewMea_withinRun) == 'character'){ - rmPSM_withfewMea_withinRun = char_to_boolean[rmPSM_withfewMea_withinRun] + rmPSM_withfewMea_withinRun <- char_to_boolean[rmPSM_withfewMea_withinRun] } if (length(args)<7) { # sum or max - when there are multiple measurements for certain feature in certain Run. - args[7] = 'sum' + args[7] <- 'sum' } if (length(args)<8) { # summarization methods to protein-level can be performed: "msstats(default)" - args[8] = "msstats" + args[8] <- "msstats" } if (length(args)<9) { # Global median normalization on peptide level data - args[9] = TRUE + args[9] <- TRUE } -global_norm = args[9] +global_norm <- args[9] if(typeof(global_norm) == 'character'){ - global_norm = char_to_boolean[global_norm] + global_norm <- char_to_boolean[global_norm] } if (length(args)<10) { # Remove norm channel - args[10] = TRUE + args[10] <- TRUE } -remove_norm_channel = args[10] +remove_norm_channel <- args[10] if(typeof(remove_norm_channel) == 'character'){ - remove_norm_channel = char_to_boolean[remove_norm_channel] + remove_norm_channel <- char_to_boolean[remove_norm_channel] } if (length(args)<11) { # default Reference channel based normalization between MS runs on protein level data. # Reference Channel annotated by 'Norm' in Condition. - args[11] = TRUE + args[11] <- TRUE } -reference_norm = args[11] +reference_norm <- args[11] if(typeof(reference_norm) == 'character'){ - reference_norm = char_to_boolean[reference_norm] + reference_norm <- char_to_boolean[reference_norm] } csv_input <- args[1] @@ -147,6 +147,7 @@ if (l == 1) contrast_lst <- unlist(strsplit(contrast_str,";")) contrast_mat <- make_contrasts(contrast_lst, lvls) } + print ("Contrasts to be tested:") print (contrast_mat) #TODO allow for user specified contrasts From f4d2e0539dd379b6036fe1697c1eae72f6e9aadc Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Apr 2022 17:39:50 +0100 Subject: [PATCH 22/47] clean R --- bin/msstats_utils.R | 66 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/bin/msstats_utils.R b/bin/msstats_utils.R index 63d49438..86571110 100644 --- a/bin/msstats_utils.R +++ b/bin/msstats_utils.R @@ -1,32 +1,82 @@ - #' Inizialize the TMT and LFQ parameters #' #' @param usage message to exit the script analysis #' #' @return -initialize_msstats <- function (usage){ - args <- commandArgs(trailingOnly=TRUE) - if (length(args)<1) { +initialize_msstats <- function(usage) { + args <- commandArgs(trailingOnly = TRUE) + if (length(args) < 1) { print(usage) - stop("At least the first argument must be supplied (input csv).n", call.=FALSE) + stop("At least the first argument must be supplied (input csv).n", call. = FALSE) } - if (length(args)<2) { + if (length(args) < 2) { args[2] <- "pairwise" } - if (length(args)<3) { + if (length(args) < 3) { # default control condition args[3] <- "" } - if (length(args)<4) { + if (length(args) < 4) { # removeOneFeatProts args[4] <- FALSE } return(args) } +#' Handle the number of contrasts in the differential expression analysis. +#' It returns a matrix of the contrasts to be analyzed. +#' +#' @param l +#' @param contrast_str +#' @param lvls number of doncitions + +#' @return +#' +parse_contrasts <- function(l, contrast_str, lvls) { + if (contrast_str == "pairwise") { + if (control_str == "") { + contrast_mat <- matrix(nrow = l * (l - 1) / 2, ncol = l, dimnames = list(Contrasts = rep(NA, l * (l - 1) / 2), Levels = lvls)) + c <- 1 + for (i in 1:(l - 1)) { + for (j in (i + 1):l) { + comparison <- rep(0, l) + comparison[i] <- 1 + comparison[j] <- -1 + contrast_mat[c,] <- comparison + rownames(contrast_mat)[c] <- paste0(lvls[i], "-", lvls[j]) + c <- c + 1 + } + } + } else { + control <- which(as.character(lvls) == control_str) + if (length(control) == 0) { + stop("Control condition not part of found levels.n", call. = FALSE) + } + contrast_mat <- matrix(nrow = l - 1, ncol = l, dimnames = list(Contrasts = rep(NA, l - 1), Levels = lvls)) + c <- 1 + for (j in setdiff(1:l, control)) { + comparison <- rep(0, l) + comparison[i] <- -1 + comparison[j] <- 1 + contrast_mat[c,] <- comparison + rownames(contrast_mat)[c] <- paste0(lvls[i], "-", lvls[j]) + c <- c + 1 + } + } + } else { + contrast_lst <- unlist(strsplit(contrast_str, ";")) + contrast_mat <- make_contrasts(contrast_lst, lvls) + } + print("Contrasts to be tested:") + print(contrast_mat) + return(contrast_mat) +} + + + From 3c112853b08ad5979c807b69cd319c7897211e1a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 1 May 2022 11:42:28 +0100 Subject: [PATCH 23/47] remove test_full.config --- .github/workflows/awsfulltest.yml | 6 +++--- conf/test_full.config | 36 ------------------------------- conf/test_tmt.config | 2 +- nextflow.config | 1 - 4 files changed, 4 insertions(+), 41 deletions(-) delete mode 100644 conf/test_full.config diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 510bf1cb..1ea40b4d 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -1,7 +1,7 @@ name: nf-core AWS full size tests # This workflow is triggered on published releases. # It can be additionally triggered manually with GitHub actions workflow dispatch button. -# It runs the -profile 'test_full' on AWS batch +# It runs the -profile 'test_lfq' on AWS batch on: release: @@ -17,7 +17,7 @@ jobs: uses: nf-core/tower-action@v3 # TODO nf-core: You can customise AWS full pipeline tests as required # Add full size test data (but still relatively small datasets for few samples) - # on the `test_full.config` test runs with only one set of parameters + # on the `test_lfq.config` test runs with only one set of parameters with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} @@ -28,7 +28,7 @@ jobs: { "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/quantms/results-${{ github.sha }}" } - profiles: test_full,aws_tower + profiles: test_lfq,aws_tower nextflow_config: | process.errorStrategy = 'retry' process.maxRetries = 3 diff --git a/conf/test_full.config b/conf/test_full.config deleted file mode 100644 index 4e4e61f0..00000000 --- a/conf/test_full.config +++ /dev/null @@ -1,36 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests (LFQ) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a fast and simple test. - - Use as follows: - nextflow run nf-core/quantms -profile test_lfq, [--outdir ] - ------------------------------------------------------------------------------------------------- -*/ - -params { - config_profile_name = 'Test profile for DDA LFQ' - config_profile_description = 'Minimal test dataset to check pipeline function of the label-free quantification branch of the pipeline' - - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h - - outdir = "./results_lfq" - tracedir = "${params.outdir}/pipeline_info" - - // Input data - labelling_type = "label free sample" - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/lfq_ci/BSA/BSA_design_urls.tsv' - database = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/lfq_ci/BSA/18Protein_SoCe_Tr_detergents_trace_target_decoy.fasta' - posterior_probabilities = "fit_distributions" - search_engines = "msgf" - decoy_string= "rev" - enable_qc = true - add_triqler_output = true - protein_level_fdr_cutoff = 1.0 - acquisition_method = "dda" -} diff --git a/conf/test_tmt.config b/conf/test_tmt.config index 5fa1bae3..c49e8718 100644 --- a/conf/test_tmt.config +++ b/conf/test_tmt.config @@ -5,7 +5,7 @@ Defines input files and everything required to run a full size pipeline test. Use as follows: - nextflow run nf-core/quantms -profile test_full, [--outdir ] + nextflow run nf-core/quantms -profile test_tmt, [--outdir ] ---------------------------------------------------------------------------------------- */ diff --git a/nextflow.config b/nextflow.config index 2ff9f646..69320740 100644 --- a/nextflow.config +++ b/nextflow.config @@ -287,7 +287,6 @@ profiles { test_lfq { includeConfig 'conf/test_lfq.config' } test_dia { includeConfig 'conf/test_dia.config' } mambaci { includeConfig 'conf/mambaci.config' } - test_full { includeConfig 'conf/test_full.config' } } // Load module config after profile, so they can depend on overwritten input parameters specific for each profile. From a5dd1bec97c65b1c3a04531f1dafe5f78905460a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 1 May 2022 11:45:12 +0100 Subject: [PATCH 24/47] remove test_full.config --- .nf-core.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.nf-core.yml b/.nf-core.yml index 778ae193..d434ec30 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -2,3 +2,4 @@ repository_type: pipeline lint: files_exist: - conf/igenomes.config + - conf/test_full.config From 7cb5c3b1c48ac3c16037cc5a467005c49a076e0b Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 1 May 2022 11:58:41 +0100 Subject: [PATCH 25/47] remove test.config --- .github/workflows/awstest.yml | 4 ++-- .github/workflows/ci.yml | 2 +- conf/test.config | 36 ----------------------------------- nextflow.config | 1 - 4 files changed, 3 insertions(+), 40 deletions(-) delete mode 100644 conf/test.config diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index eb46abf6..a6d99339 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -1,6 +1,6 @@ name: nf-core AWS test # This workflow can be triggered manually with the GitHub actions workflow dispatch button. -# It runs the -profile 'test' on AWS batch +# It runs the -profile 'test_tmt' on AWS batch on: workflow_dispatch: @@ -23,7 +23,7 @@ jobs: { "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/quantms/results-test-${{ github.sha }}" } - profiles: test,aws_tower + profiles: test_tmt,aws_tower nextflow_config: | process.errorStrategy = 'retry' process.maxRetries = 3 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8fe11b9f..799f83b8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: # Test latest edge release of Nextflow - NXF_VER: "" NXF_EDGE: "1" - test_profile: ["test", "test_lfq", "test_dia", "test_localize", "test_tmt"] + test_profile: ["test_lfq", "test_dia", "test_localize", "test_tmt"] exec_profile: ["docker", "conda"] exclude: - test_profile: test_dia diff --git a/conf/test.config b/conf/test.config deleted file mode 100644 index 5ddca0b8..00000000 --- a/conf/test.config +++ /dev/null @@ -1,36 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests (ISO) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a fast and simple pipeline test. - - Use as follows: - nextflow run nf-core/quantms -profile test, [--outdir ] - -------------------------------------------------------------------------------------------- -*/ - -params { - config_profile_name = 'Test profile DDA ISO' - config_profile_description = 'Minimal test dataset to check pipeline function of the isotopic labelling branch of the pipeline' - - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '6.GB' - max_time = '6.h' - - outdir = "./results_iso" - tracedir = "${params.outdir}/pipeline_info" - - // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/tmt_ci/PXD000001.sdrf.tsv' - - database = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/tmt_ci/erwinia_carotovora.fasta' - posterior_probabilities = "percolator" - search_engines = "msgf" - protein_level_fdr_cutoff = 0.01 - decoy_string = "rev" - add_decoys = true - variable_mods = 'Oxidation (M)' - fixed_mods = 'Methylthio (C)' -} diff --git a/nextflow.config b/nextflow.config index 69320740..e9277f61 100644 --- a/nextflow.config +++ b/nextflow.config @@ -281,7 +281,6 @@ profiles { podman.enabled = false shifter.enabled = false } - test { includeConfig 'conf/test.config' } test_localize { includeConfig 'conf/test_localize.config' } test_tmt { includeConfig 'conf/test_tmt.config' } test_lfq { includeConfig 'conf/test_lfq.config' } From 921882fd7ab14ea678665fe251855022e2a1a379 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 1 May 2022 12:00:55 +0100 Subject: [PATCH 26/47] remove test.config --- .nf-core.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.nf-core.yml b/.nf-core.yml index d434ec30..c021bae9 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -3,3 +3,4 @@ lint: files_exist: - conf/igenomes.config - conf/test_full.config + - conf/test.config From 9dd525e327c0fc3a8b8f8089862f634e7f9e2744 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 1 May 2022 17:33:16 +0100 Subject: [PATCH 27/47] Functions implemented in lfq --- bin/msstats_plfq.R | 322 ++++++++++++++++++++++----------------------- 1 file changed, 159 insertions(+), 163 deletions(-) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index 370f8eaf..e2cd6d94 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -1,25 +1,160 @@ #!/usr/bin/env Rscript -args <- commandArgs(trailingOnly=TRUE) -char_to_boolean <- c("true"=TRUE, "false"=FALSE) -usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." -#TODO rewrite mzTab in next version -if (length(args)<1) { - print(usage) - stop("At least the first argument must be supplied (input csv).n", call.=FALSE) -} -if (length(args)<2) { - # contrasts - args[2] <- "pairwise" +# load the MSstats library +require(MSstats) +require(tibble) +require(data.table) + +# TODO: Functions shared between msstats_plfq and msstats_tmt should be merge in msstats_utils.R +# Please functions syncronized between the three scripts until the code can be merged. + +### Begining Functions section + +#' Inizialize the TMT and LFQ parameters +#' +#' @param usage message to exit the script analysis +#' +#' @return + +initialize_msstats <- function(usage) { + args <- commandArgs(trailingOnly = TRUE) + if (length(args) < 1) { + print(usage) + stop("At least the first argument must be supplied (input csv).n", call. = FALSE) + } + if (length(args) < 2) { + args[2] <- "pairwise" + } + + if (length(args) < 3) { + # default control condition + args[3] <- "" + } + + if (length(args) < 4) { + # removeOneFeatProts + args[4] <- FALSE + } + return(args) } -if (length(args)<3) { - # default control condition - args[3] <- "" + +#' Handle the number of contrasts in the differential expression analysis. +#' It returns a matrix of the contrasts to be analyzed. +#' +#' @param l +#' @param contrast_str +#' @param lvls number of doncitions + +#' @return +#' +parse_contrasts <- function(l, contrast_str, lvls) { + if (contrast_str == "pairwise") { + if (control_str == "") { + contrast_mat <- matrix(nrow = l * (l - 1) / 2, ncol = l, dimnames = list(Contrasts = rep(NA, l * (l - 1) / 2), Levels = lvls)) + c <- 1 + for (i in 1:(l - 1)) { + for (j in (i + 1):l) { + comparison <- rep(0, l) + comparison[i] <- 1 + comparison[j] <- -1 + contrast_mat[c,] <- comparison + rownames(contrast_mat)[c] <- paste0(lvls[i], "-", lvls[j]) + c <- c + 1 + } + } + } else { + control <- which(as.character(lvls) == control_str) + if (length(control) == 0) { + stop("Control condition not part of found levels.n", call. = FALSE) + } + contrast_mat <- matrix(nrow = l - 1, ncol = l, dimnames = list(Contrasts = rep(NA, l - 1), Levels = lvls)) + c <- 1 + for (j in setdiff(1:l, control)) { + comparison <- rep(0, l) + comparison[i] <- -1 + comparison[j] <- 1 + contrast_mat[c,] <- comparison + rownames(contrast_mat)[c] <- paste0(lvls[i], "-", lvls[j]) + c <- c + 1 + } + } + } else { + contrast_lst <- unlist(strsplit(contrast_str, ";")) + contrast_mat <- make_contrasts(contrast_lst, lvls) + } + print("Contrasts to be tested:") + print(contrast_mat) + return(contrast_mat) } -if (length(args)<4) { - # removeOneFeatProts - args[4] <- FALSE + +#' This functions hels to define the contrasts that will be compare. +#' +#' @param contrasts +#' @param levels +#' +#' @return +make_contrasts <- function(contrasts, levels) { + #helper function + indicatorRow <- function(pos,len){ + row <- rep(0,len) + row[pos] <- 1 + return(row) + } + + if (is.factor(levels)) levels <- levels(levels) + if (!is.character(levels)) levels <- colnames(levels) + + l <- length(levels) + if (l < 1){ + stop("No levels given") + } + + ncontr <- length(contrasts) + if (ncontr < 1){ + stop("No contrasts given") + } + + levelsenv <- new.env() + for (i in 1:l) { + assign(levels[i], indicatorRow(i,l), pos=levelsenv) + } + + contrastmat <- matrix(0, l, ncontr, dimnames=list(Levels=levels,Contrasts=contrasts)) + for (j in 1:ncontr) { + contrastsj <- parse(text=contrasts[j]) + contrastmat[,j] <- eval(contrastsj, envir=levelsenv) + } + return(t(contrastmat)) } + +#' Get missing samples by condition +#' +#' @param processedData +#' +#' @return +get_missing_in_condition <- function(processedData) { + p <- processedData + n_samples <- aggregate(p$SUBJECT, by = list(p$GROUP), FUN = function(x) {return(length(unique(as.numeric(x))))}) + colnames(n_samples) <- c("GROUP", "n_samples") + p <- p[complete.cases(p["LogIntensities"]),][,c("Protein", "GROUP", "SUBJECT")] + p_dup <- p[!duplicated(p),] + p_dup_agg <- aggregate(p_dup$SUBJECT, by = list(p_dup$Protein, p_dup$GROUP), length) + colnames(p_dup_agg) <- c("Protein", "GROUP", "non_na") + agg_join <- merge(p_dup_agg, n_samples, by = "GROUP") + agg_join$missingInCondition <- 1 - agg_join$non_na / agg_join$n_samples + + p <- dcast(setDT(agg_join), Protein~GROUP, value.var = "missingInCondition") + return(p) + } + +### End Function Sections + +char_to_boolean <- c("true"=TRUE, "false"=FALSE) +usage <- "Rscript msstats_plfq.R input.csv [list of contrasts or 'pairwise'] [default control condition or ''] ..." + +#TODO rewrite mzTab in next version +args <- initialize_msstats(usage = usage) + removeOneFeatProts <- args[4] if(typeof(removeOneFeatProts) == 'character'){ removeOneFeatProts <- char_to_boolean[removeOneFeatProts] @@ -30,6 +165,7 @@ if (length(args)<5) { args[5] <- TRUE } removeFewMeasurements <- args[5] + if(typeof(removeFewMeasurements) == 'character'){ removeFewMeasurements <- char_to_boolean[removeFewMeasurements] } @@ -51,52 +187,6 @@ csv_input <- args[1] contrast_str <- args[2] control_str <- args[3] -# load the MSstats library -require(MSstats) -require(tibble) -require(data.table) - -# helper functions -make_contrasts <- function(contrasts, levels) -{ - #helper function - indicatorRow <- function(pos,len) - { - row <- rep(0,len) - row[pos] <- 1 - return(row) - } - - if (is.factor(levels)) levels <- levels(levels) - if (!is.character(levels)) levels <- colnames(levels) - - l <- length(levels) - if (l < 1) - { - stop("No levels given") - } - - ncontr <- length(contrasts) - if (ncontr < 1) - { - stop("No contrasts given") - } - - levelsenv <- new.env() - for (i in 1:l) - { - assign(levels[i], indicatorRow(i,l), pos=levelsenv) - } - - contrastmat <- matrix(0, l, ncontr, dimnames=list(Levels=levels,Contrasts=contrasts)) - for (j in 1:ncontr) - { - contrastsj <- parse(text=contrasts[j]) - contrastmat[,j] <- eval(contrastsj, envir=levelsenv) - } - return(t(contrastmat)) -} - # read dataframe into MSstats data <- read.csv(csv_input) quant <- OpenMStoMSstatsFormat(data, removeProtein_with1Feature = removeOneFeatProts, removeFewMeasurements=removeFewMeasurements) @@ -106,127 +196,33 @@ processed.quant <- dataProcess(quant, censoredInt = 'NA', featureSubset = args[6 lvls <- levels(as.factor(data$Condition)) l <- length(lvls) -if (l == 1) -{ + +if (l == 1) { print("Only one condition found. No contrasts to be tested. If this is not the case, please check your experimental design.") } else { - if (contrast_str == "pairwise") - { - if (control_str == "") - { - contrast_mat <- matrix(nrow = l * (l-1) / 2, ncol = l, dimnames=list(Contrasts=rep(NA, l * (l-1) / 2), Levels=lvls)) - c <- 1 - for (i in 1:(l-1)) - { - for (j in (i+1):l) - { - comparison <- rep(0,l) - comparison[i] <- 1 - comparison[j] <- -1 - contrast_mat[c,] <- comparison - rownames(contrast_mat)[c] <- paste0(lvls[i],"-",lvls[j]) - c <- c+1 - } - } - } else { - control <- which(as.character(lvls) == control_str) - if (length(control) == 0) - { - stop("Control condition not part of found levels.n", call.=FALSE) - } - - contrast_mat <- matrix(nrow = l-1, ncol = l, dimnames=list(Contrasts=rep(NA, l-1),Levels=lvls)) - c <- 1 - for (j in setdiff(1:l,control)) - { - comparison <- rep(0,l) - comparison[i] <- -1 - comparison[j] <- 1 - contrast_mat[c,] <- comparison - rownames(contrast_mat)[c] <- paste0(lvls[i],"-",lvls[j]) - c <- c+1 - } - } - } else { - contrast_lst <- unlist(strsplit(contrast_str,";")) - contrast_mat <- make_contrasts(contrast_lst, lvls) - } - + contrast_mat <- parse_contrasts(l = l, contrast_str = contrast_str, lvls = lvls) print ("Contrasts to be tested:") print (contrast_mat) #TODO allow for user specified contrasts test.MSstats <- groupComparison(contrast.matrix=contrast_mat, data=processed.quant) - #for (comp in rownames(contrast_mat)) - #{ - # groupComparisonPlots(data=test.MSstats$ComparisonResult, type="ComparisonPlot", - # width=12, height=12,dot.size = 2, sig=1)#, - # which.Comparison = comp, - # address=F) - # # try to plot all comparisons - #} - - - # annotate how often the protein was quantified in each condition (NA values introduced by merge of completely missing are set to 1.0) - ############ also calculate missingness on condition level - - # input: ProcessedData matrix of MSstats - # output: - # calculate fraction of na in condition (per protein) - # Groups: PROTEIN [762] - # PROTEIN `1` `2` - # - # 1 sp|A1ANS1|HTPG_PELPD 0 0.5 - # 2 sp|A2I7N3|SPA37_BOVIN 0 0.5 - # 3 sp|A2VDF0|FUCM_HUMAN 0 0.5 - # 4 sp|A6ND91|ASPD_HUMAN 0.5 0.5 - # 5 sp|A7E3W2|LG3BP_BOVIN 0.5 0.5 - # 6 sp|B8FGT4|ATPB_DESAA 0 0.5 - - getMissingInCondition <- function(processedData) { - p <- processedData - n_samples <- aggregate(p$SUBJECT, by = list(p$GROUP), FUN = function(x) {return(length(unique(as.numeric(x))))}) - colnames(n_samples) <- c("GROUP", "n_samples") - p <- p[complete.cases(p["LogIntensities"]),][,c("Protein", "GROUP", "SUBJECT")] - p_dup <- p[!duplicated(p),] - p_dup_agg <- aggregate(p_dup$SUBJECT, by = list(p_dup$Protein, p_dup$GROUP), length) - colnames(p_dup_agg) <- c("Protein", "GROUP", "non_na") - agg_join <- merge(p_dup_agg, n_samples, by = "GROUP") - agg_join$missingInCondition = 1 - agg_join$non_na / agg_join$n_samples - - p <- dcast(setDT(agg_join), Protein~GROUP, value.var = "missingInCondition") - return(p) - } - - mic <- getMissingInCondition(processed.quant$ProteinLevelData) + mic <- get_missing_in_condition(processed.quant$ProteinLevelData) test.MSstats$ComparisonResult <- merge(x=test.MSstats$ComparisonResult, y=mic, by="Protein") commoncols <- intersect(colnames(mic), colnames(test.MSstats$ComparisonResult)) test.MSstats$ComparisonResult[, commoncols] <- apply(test.MSstats$Comparison[, commoncols], 2, function(x) {x[is.na(x)] <- 1; return(x)}) - #write comparison to CSV (one CSV per contrast) - # writeComparisonToCSV <- function(DF) - # { - # write.table(DF, file=paste0("comparison_",unique(DF$Label),".csv"), quote=FALSE, sep='\t', row.names = FALSE) - # return(DF) - # } - # ComparisonResultSplit <- split(test.MSstats$ComparisonResult, test.MSstats$ComparisonResult$Label) - # for(i in 1:length(ComparisonResultSplit)){ - # writeComparisonToCSV(ComparisonResultSplit[[i]]) - # } - #write all comparisons into one CSV file write.table(test.MSstats$ComparisonResult, file=paste0(args[8],"_comparisons.csv"), quote=FALSE, sep='\t', row.names = FALSE) groupComparisonPlots(data=test.MSstats$ComparisonResult, type="ComparisonPlot", width=12, height=12,dot.size = 2) - test.MSstats$Volcano = test.MSstats$ComparisonResult[!is.na(test.MSstats$ComparisonResult$pvalue),] + test.MSstats$Volcano <- test.MSstats$ComparisonResult[!is.na(test.MSstats$ComparisonResult$pvalue),] groupComparisonPlots(data=test.MSstats$Volcano, type="VolcanoPlot", width=12, height=12,dot.size = 2) # Otherwise it fails since the behaviour is undefined - if (nrow(contrast_mat) > 1) - { + if (nrow(contrast_mat) > 1) { groupComparisonPlots(data=test.MSstats$ComparisonResult, type="Heatmap", width=12, height=12,dot.size = 2) } From 60c8a36b81fe7a4eda802a0981057f7e1dcb14f5 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 1 May 2022 18:12:21 +0100 Subject: [PATCH 28/47] Update bin/msstats_plfq.R Co-authored-by: Julianus Pfeuffer --- bin/msstats_plfq.R | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index e2cd6d94..9c8508cd 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -203,7 +203,6 @@ if (l == 1) { contrast_mat <- parse_contrasts(l = l, contrast_str = contrast_str, lvls = lvls) print ("Contrasts to be tested:") print (contrast_mat) - #TODO allow for user specified contrasts test.MSstats <- groupComparison(contrast.matrix=contrast_mat, data=processed.quant) mic <- get_missing_in_condition(processed.quant$ProteinLevelData) From 280dd6da37954121d95f7af8958238e9ccf252ed Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 1 May 2022 18:14:04 +0100 Subject: [PATCH 29/47] Update modules/local/msstats/main.nf Co-authored-by: Julianus Pfeuffer --- modules/local/msstats/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/msstats/main.nf b/modules/local/msstats/main.nf index 4c4aa9ba..905012ce 100644 --- a/modules/local/msstats/main.nf +++ b/modules/local/msstats/main.nf @@ -1,7 +1,7 @@ process MSSTATS { label 'process_medium' - conda (params.enable_conda ? "bioconda::bioconductor-msstats=4.2.0 conda-forge::r-here" : null) + conda (params.enable_conda ? "bioconda::bioconductor-msstats=4.2.0" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/bioconductor-msstats:4.2.0--r41h619a076_1" } else { From 6686fa2c1db154a3909ce8142cc5916ebd493e11 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 1 May 2022 18:23:37 +0100 Subject: [PATCH 30/47] functions added to msstats_tmt.R --- bin/msstats_plfq.R | 3 +- bin/msstats_tmt.R | 224 +++++++++++++++++++++++++++++--------------- bin/msstats_utils.R | 69 +++++++++++++- 3 files changed, 218 insertions(+), 78 deletions(-) diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index e2cd6d94..70459c8e 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -15,7 +15,6 @@ require(data.table) #' @param usage message to exit the script analysis #' #' @return - initialize_msstats <- function(usage) { args <- commandArgs(trailingOnly = TRUE) if (length(args) < 1) { @@ -44,7 +43,7 @@ initialize_msstats <- function(usage) { #' @param l #' @param contrast_str #' @param lvls number of doncitions - +#' #' @return #' parse_contrasts <- function(l, contrast_str, lvls) { diff --git a/bin/msstats_tmt.R b/bin/msstats_tmt.R index 8e8994ab..08e80114 100755 --- a/bin/msstats_tmt.R +++ b/bin/msstats_tmt.R @@ -1,25 +1,154 @@ #!/usr/bin/env Rscript -args <- commandArgs(trailingOnly=TRUE) -char_to_boolean <- c("true"=TRUE, "false"=FALSE) +require(MSstatsTMT) + +# TODO: Functions shared between msstats_plfq and msstats_tmt should be merge in msstats_utils.R +# Please functions syncronized between the three scripts until the code can be merged. + +### Begining Functions section + +#' Inizialize the TMT and LFQ parameters +#' +#' @param usage message to exit the script analysis +#' +#' @return +initialize_msstats <- function(usage) { + args <- commandArgs(trailingOnly = TRUE) + if (length(args) < 1) { + print(usage) + stop("At least the first argument must be supplied (input csv).n", call. = FALSE) + } + if (length(args) < 2) { + args[2] <- "pairwise" + } + + if (length(args) < 3) { + # default control condition + args[3] <- "" + } + + if (length(args) < 4) { + # removeOneFeatProts + args[4] <- FALSE + } + return(args) +} + +#' Handle the number of contrasts in the differential expression analysis. +#' It returns a matrix of the contrasts to be analyzed. +#' +#' @param l +#' @param contrast_str +#' @param lvls number of doncitions +#' +#' @return +#' +parse_contrasts <- function(l, contrast_str, lvls) { + if (contrast_str == "pairwise") { + if (control_str == "") { + contrast_mat <- matrix(nrow = l * (l - 1) / 2, ncol = l, dimnames = list(Contrasts = rep(NA, l * (l - 1) / 2), Levels = lvls)) + c <- 1 + for (i in 1:(l - 1)) { + for (j in (i + 1):l) { + comparison <- rep(0, l) + comparison[i] <- 1 + comparison[j] <- -1 + contrast_mat[c,] <- comparison + rownames(contrast_mat)[c] <- paste0(lvls[i], "-", lvls[j]) + c <- c + 1 + } + } + } else { + control <- which(as.character(lvls) == control_str) + if (length(control) == 0) { + stop("Control condition not part of found levels.n", call. = FALSE) + } + contrast_mat <- matrix(nrow = l - 1, ncol = l, dimnames = list(Contrasts = rep(NA, l - 1), Levels = lvls)) + c <- 1 + for (j in setdiff(1:l, control)) { + comparison <- rep(0, l) + comparison[i] <- -1 + comparison[j] <- 1 + contrast_mat[c,] <- comparison + rownames(contrast_mat)[c] <- paste0(lvls[i], "-", lvls[j]) + c <- c + 1 + } + } + } else { + contrast_lst <- unlist(strsplit(contrast_str, ";")) + contrast_mat <- make_contrasts(contrast_lst, lvls) + } + print("Contrasts to be tested:") + print(contrast_mat) + return(contrast_mat) +} + +#' This functions hels to define the contrasts that will be compare. +#' +#' @param contrasts +#' @param levels +#' +#' @return +make_contrasts <- function(contrasts, levels) { + #helper function + indicatorRow <- function(pos,len){ + row <- rep(0,len) + row[pos] <- 1 + return(row) + } + + if (is.factor(levels)) levels <- levels(levels) + if (!is.character(levels)) levels <- colnames(levels) + + l <- length(levels) + if (l < 1){ + stop("No levels given") + } + + ncontr <- length(contrasts) + if (ncontr < 1){ + stop("No contrasts given") + } + + levelsenv <- new.env() + for (i in 1:l) { + assign(levels[i], indicatorRow(i,l), pos=levelsenv) + } + + contrastmat <- matrix(0, l, ncontr, dimnames=list(Levels=levels,Contrasts=contrasts)) + for (j in 1:ncontr) { + contrastsj <- parse(text=contrasts[j]) + contrastmat[,j] <- eval(contrastsj, envir=levelsenv) + } + return(t(contrastmat)) +} + +#' Get missing samples by condition +#' +#' @param processedData +#' +#' @return +get_missing_in_condition <- function(processedData) { + p <- processedData + n_samples <- aggregate(p$SUBJECT, by = list(p$GROUP), FUN = function(x) {return(length(unique(as.numeric(x))))}) + colnames(n_samples) <- c("GROUP", "n_samples") + p <- p[complete.cases(p["LogIntensities"]),][,c("Protein", "GROUP", "SUBJECT")] + p_dup <- p[!duplicated(p),] + p_dup_agg <- aggregate(p_dup$SUBJECT, by = list(p_dup$Protein, p_dup$GROUP), length) + colnames(p_dup_agg) <- c("Protein", "GROUP", "non_na") + agg_join <- merge(p_dup_agg, n_samples, by = "GROUP") + agg_join$missingInCondition <- 1 - agg_join$non_na / agg_join$n_samples + + p <- dcast(setDT(agg_join), Protein~GROUP, value.var = "missingInCondition") + return(p) + } + +### End Function Sections +char_to_boolean <- c("true"=TRUE, "false"=FALSE) usage <- "Rscript msstats_tmt.R input.csv [list of contrasts or 'pairwise'] [default control condition or '']... [normalization based reference channel]" -if (length(args)<1) { - print(usage) - stop("At least the first argument must be supplied (input csv).n", call.=FALSE) -} -if (length(args)<2) { - # contrasts - args[2] <- "pairwise" -} -if (length(args)<3) { - # default control condition - args[3] <- "" -} -if (length(args)<4) { - # removeOneFeatProts - args[4] <- FALSE -} +args <- initialize_msstats(usage = usage) + rmProtein_with1Feature <- args[4] if(typeof(rmProtein_with1Feature) == 'character'){ rmProtein_with1Feature <- char_to_boolean[rmProtein_with1Feature] @@ -85,7 +214,6 @@ csv_input <- args[1] contrast_str <- args[2] control_str <- args[3] -require(MSstatsTMT) # read dataframe into MSstatsTMT data <- read.csv(csv_input) quant <- OpenMStoMSstatsTMTFormat(data, useUniquePeptide=useUniquePeptide, rmPSM_withfewMea_withinRun=rmPSM_withfewMea_withinRun, @@ -102,52 +230,11 @@ dataProcessPlotsTMT(processed.quant, "QCPlot", width=12, height=12, which.Protei lvls <- levels(as.factor(processed.quant$ProteinLevelData$Condition)) l <- length(lvls) -if (l == 1) -{ + +if (l == 1) { print("Only one condition found. No contrasts to be tested. If this is not the case, please check your experimental design.") } else { - if (contrast_str == "pairwise") - { - if (control_str == "") - { - contrast_mat <- matrix(nrow = l * (l-1) / 2, ncol = l, dimnames=list(Contrasts=rep(NA, l * (l-1) / 2), Levels=lvls)) - c <- 1 - for (i in 1:(l-1)) - { - for (j in (i+1):l) - { - comparison <- rep(0,l) - comparison[i] <- 1 - comparison[j] <- -1 - contrast_mat[c,] <- comparison - rownames(contrast_mat)[c] <- paste0(lvls[i],"-",lvls[j]) - c <- c+1 - } - } - } else { - control <- which(as.character(lvls) == control_str) - if (length(control) == 0) - { - stop("Control condition not part of found levels.n", call.=FALSE) - } - - contrast_mat <- matrix(nrow = l-1, ncol = l, dimnames=list(Contrasts=rep(NA, l-1),Levels=lvls)) - c <- 1 - for (j in setdiff(1:l,control)) - { - comparison <- rep(0,l) - comparison[i] <- -1 - comparison[j] <- 1 - contrast_mat[c,] <- comparison - rownames(contrast_mat)[c] <- paste0(lvls[i],"-",lvls[j]) - c <- c+1 - } - } - } else { - contrast_lst <- unlist(strsplit(contrast_str,";")) - contrast_mat <- make_contrasts(contrast_lst, lvls) - } - + contrast_mat <- parse_contrasts(l = l, contrast_str = contrast_str, lvls = lvls) print ("Contrasts to be tested:") print (contrast_mat) #TODO allow for user specified contrasts @@ -155,15 +242,4 @@ if (l == 1) #TODO allow manual input (e.g. proteins of interest) write.table(test.MSstatsTMT$ComparisonResult, file=paste0("msstatsiso_results.csv"), quote=FALSE, sep='\t', row.names = FALSE) - - #write comparison to CSV (one CSV per contrast) - # writeComparisonToCSV <- function(DF) - # { - # write.table(DF, file=paste0("comparison_",unique(DF$Label),".csv"), quote=FALSE, sep='\t', row.names = FALSE) - # return(DF) - # } - # ComparisonResultSplit <- split(test.MSstatsTMT$ComparisonResult, test.MSstatsTMT$ComparisonResult$Label) - # for(i in 1:length(ComparisonResultSplit)){ - # writeComparisonToCSV(ComparisonResultSplit[[i]]) - # } } diff --git a/bin/msstats_utils.R b/bin/msstats_utils.R index 86571110..9106d4b6 100644 --- a/bin/msstats_utils.R +++ b/bin/msstats_utils.R @@ -1,9 +1,10 @@ +### Begining Functions section + #' Inizialize the TMT and LFQ parameters #' #' @param usage message to exit the script analysis #' #' @return - initialize_msstats <- function(usage) { args <- commandArgs(trailingOnly = TRUE) if (length(args) < 1) { @@ -32,7 +33,7 @@ initialize_msstats <- function(usage) { #' @param l #' @param contrast_str #' @param lvls number of doncitions - +#' #' @return #' parse_contrasts <- function(l, contrast_str, lvls) { @@ -75,6 +76,70 @@ parse_contrasts <- function(l, contrast_str, lvls) { return(contrast_mat) } +#' This functions hels to define the contrasts that will be compare. +#' +#' @param contrasts +#' @param levels +#' +#' @return +make_contrasts <- function(contrasts, levels) { + #helper function + indicatorRow <- function(pos,len){ + row <- rep(0,len) + row[pos] <- 1 + return(row) + } + + if (is.factor(levels)) levels <- levels(levels) + if (!is.character(levels)) levels <- colnames(levels) + + l <- length(levels) + if (l < 1){ + stop("No levels given") + } + + ncontr <- length(contrasts) + if (ncontr < 1){ + stop("No contrasts given") + } + + levelsenv <- new.env() + for (i in 1:l) { + assign(levels[i], indicatorRow(i,l), pos=levelsenv) + } + + contrastmat <- matrix(0, l, ncontr, dimnames=list(Levels=levels,Contrasts=contrasts)) + for (j in 1:ncontr) { + contrastsj <- parse(text=contrasts[j]) + contrastmat[,j] <- eval(contrastsj, envir=levelsenv) + } + return(t(contrastmat)) +} + +#' Get missing samples by condition +#' +#' @param processedData +#' +#' @return +get_missing_in_condition <- function(processedData) { + p <- processedData + n_samples <- aggregate(p$SUBJECT, by = list(p$GROUP), FUN = function(x) {return(length(unique(as.numeric(x))))}) + colnames(n_samples) <- c("GROUP", "n_samples") + p <- p[complete.cases(p["LogIntensities"]),][,c("Protein", "GROUP", "SUBJECT")] + p_dup <- p[!duplicated(p),] + p_dup_agg <- aggregate(p_dup$SUBJECT, by = list(p_dup$Protein, p_dup$GROUP), length) + colnames(p_dup_agg) <- c("Protein", "GROUP", "non_na") + agg_join <- merge(p_dup_agg, n_samples, by = "GROUP") + agg_join$missingInCondition <- 1 - agg_join$non_na / agg_join$n_samples + + p <- dcast(setDT(agg_join), Protein~GROUP, value.var = "missingInCondition") + return(p) + } + +### End Function Sections + + + From 72f77af24e5fd88d717e0aa1047e99109a4f9b77 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Thu, 30 Jun 2022 09:14:33 +0800 Subject: [PATCH 31/47] enable mzTab for DIA-NN --- bin/diann_convert.py | 448 ++++++++++++++++++++- modules/local/diannconvert/main.nf | 17 +- modules/local/diannsummary/main.nf | 1 + subworkflows/local/create_input_channel.nf | 1 + workflows/dia.nf | 3 +- workflows/quantms.nf | 2 +- 6 files changed, 460 insertions(+), 12 deletions(-) diff --git a/bin/diann_convert.py b/bin/diann_convert.py index fd8d09d3..86e29191 100755 --- a/bin/diann_convert.py +++ b/bin/diann_convert.py @@ -4,21 +4,72 @@ import click import os import re +import numpy as np +from Bio import SeqIO +from Bio.SeqUtils import molecular_weight from sdrf_pipelines.openms.unimod import UnimodDatabase -CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) +CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) @click.group(context_settings=CONTEXT_SETTINGS) def cli(): pass -@click.command('convert') +@click.command("convert") @click.option("--diann_report", "-r",) @click.option("--exp_design", "-e") +@click.option("--pg_matrix", "-pg") +@click.option("--pr_matrix", "-pr") +@click.option("--unique_matrix", "-un") +@click.option("--openms", "-o") +@click.option("--fasta", "-f") +@click.option("--charge", "-c") +@click.option("--missed_cleavages", "-m") @click.option("--qvalue_threshold", "-q", type=float) @click.pass_context -def convert(ctx, diann_report, exp_design, qvalue_threshold): - # Convert to MSstats - report = pd.read_csv(diann_report, sep = "\t", header = 0) + +def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, openms, fasta, charge, missed_cleavages, qvalue_threshold): + """This function is designed to convert the DIA-NN output into three standard formats: MSstats, Triqler and mzTab, for quality control + and downstream analysis. + + :param diann_report: Path to the main report output by DIA-NN + :type diann_report: str, optional + :param exp_design: Path to the experimental design file + :type exp_design: str, optional + :param pg_matrix: Path to a DIA-NN matrix file containing protein groups + :type pg_matrix: str, optional + :param pr_matrix: Path to a DIA-NN matrix file containing precursors + :type pr_matrix: str, optional + :param unique_matrix: Path to a DIA-NN matrix file containing unique genes + :type unique_matrix: str, optional + :param openms: Path to OpenMS, one of the workflow configuration files + :type openms: str, optional + :param fasta: Path to the fasta file + :type fasta: str, optional + :param charge: The charge assigned by DIA-NN(max_precursor_charge) + :type charge: int, optional + :param missed_cleavages: Allowed missed cleavages assigned by DIA-NN + :type missed_cleavages: int, optional + :param qvalue_threshold: Threshold for filtering q value + :type qvalue_threshold: float, optional + """ + + pg = pd.read_csv(pg_matrix, sep = "\t", header = 0, dtype = "str") + pr = pd.read_csv(pr_matrix, sep = "\t", header = 0, dtype = "str") + unique = pd.read_csv(unique_matrix, sep = "\t", header = 0, dtype = "str") + opms = pd.read_csv(openms, sep = "\t", header = 0, dtype = "str") + opms.fillna("null", inplace=True) + report = pd.read_csv(diann_report, sep = "\t", header = 0, dtype = "str") + report["Precursor.Mz"] = report.apply(lambda x: molecular_weight(x["Stripped.Sequence"], 'protein', monoisotopic=True) / int(x["Precursor.Charge"]), axis=1) + precursor_list = list(report["Precursor.Id"].unique()) + report["precursor.Index"] = report.apply(lambda x: precursor_list.index(x["Precursor.Id"]), axis=1) + + col = ["Q.Value", "Precursor.Normalised", "RT", "Global.Q.Value", "Lib.Q.Value", "PG.MaxLFQ"] + for i in col: + report.loc[:, i] = report.loc[:, i].astype('float') + + # filter based on qvalue parameter for downstream analysiss + report = report[report["Q.Value"] < qvalue_threshold] + unimod_data = UnimodDatabase() with open(exp_design, 'r') as f: data = f.readlines() @@ -32,9 +83,7 @@ def convert(ctx, diann_report, exp_design, qvalue_threshold): s_header = data[empty_row + 1].replace("\n", "").split("\t") s_DataFrame = pd.DataFrame(s_table, columns=s_header) - # filter based on qvalue parameter for downstream analysiss - report = report[report["Q.Value"] < qvalue_threshold] - + # Convert to MSstats out_msstats = pd.DataFrame() out_msstats = report[['Protein.Names', 'Modified.Sequence', 'Precursor.Charge', 'Precursor.Quantity', 'File.Name','Run']] out_msstats.columns = ['ProteinName', 'PeptideSequence', 'PrecursorCharge', 'Intensity', 'Reference', 'Run'] @@ -60,10 +109,40 @@ def convert(ctx, diann_report, exp_design, qvalue_threshold): out_triqler = out_triqler[out_triqler["intensity"] != 0] out_triqler.to_csv(os.path.splitext(os.path.basename(exp_design))[0] + '_triqler_in.tsv', sep='\t', index=False) + # Convert to mzTab + fasta_pd = pd.DataFrame() + line = 0 + for seq_record in SeqIO.parse(fasta,"fasta"): + fasta_pd.loc[line, "id"] = seq_record.id + fasta_pd.loc[line, "seq"] = re.sub("\(|\)|\,", "", str(seq_record.seq)) + fasta_pd.loc[line, "len"] = len(seq_record) + line += 1 + + index_ref = f_table + index_ref.loc[:, "ms_run"] = index_ref.apply(lambda x: x["Fraction_Group"], axis=1) + index_ref.loc[:, "study_variable"] = index_ref.apply(lambda x: x["Sample"], axis=1) + index_ref.loc[:, "ms_run"] = index_ref.loc[:, "ms_run"].astype("int") + index_ref.loc[:, "study_variable"] = index_ref.loc[:, "study_variable"].astype("int") + report[["ms_run", "study_variable"]] = report.apply(lambda x: add_info(x["Run"], index_ref), axis = 1, result_type = "expand") + + (MTD, database) = mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages) + PRH = mztab_PRH(report, pg, unique, index_ref, database, fasta_pd) + PEH = mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, database) + PSH = mztab_PSH(unique, unimod_data, report, database) + MTD.loc["", :] = "" + PRH.loc[len(PRH) + 1, :] = "" + PEH.loc[len(PEH) + 1, :] = "" + with open(os.path.splitext(os.path.basename(exp_design))[0] + '_out.mztab', "w", newline = "") as f: + MTD.to_csv(f, mode="w", sep = '\t', index = False, header = False) + PRH.to_csv(f, mode="w", sep = '\t', index = False, header = True) + PEH.to_csv(f, mode="w", sep = '\t', index = False, header = True) + PSH.to_csv(f, mode="w", sep = '\t', index = False, header = True) + + def query_expdesign_value(reference, f_table, s_table): query_reference = f_table[f_table["run"] == reference] Fraction = query_reference["Fraction"].values[0] - row = s_table[s_table["Sample"] == query_reference['Sample'].values[0]] + row = s_table[s_table["Sample"] == query_reference["Sample"].values[0]] BioReplicate = row["MSstats_BioReplicate"].values[0] Condition = row["MSstats_Condition"].values[0] @@ -80,6 +159,357 @@ def convert_modification(peptide, unimod_data): peptide = peptide + "." return peptide +def MTD_mod_info(unimod_database, fix_mod, var_mod): + pattern = re.compile("\((.*?)\)") + var_ptm = [] + fix_ptm = [] + + if fix_mod != "null": + fix_flag = 1 + for mod in fix_mod.split(","): + for m in unimod_database.modifications: + if m.get_name() == mod.split(" ")[0]: + mod_name = m.get_name() + mod_accession = m.get_accession() + break + site = re.findall(pattern, " ".join(mod.split(" ")[1:]))[0] + fix_ptm.append(("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site)) + else: + fix_flag = 0 + fix_ptm = ["[MS, MS:1002453, No fixed modifications searched, ]"] + + if var_mod != "null": + var_flag = 1 + for mod in var_mod.split(","): + for m in unimod_database.modifications: + if m.get_name() == mod.split(" ")[0]: + mod_name = m.get_name() + mod_accession = m.get_accession() + break + site = re.findall(pattern, " ".join(mod.split(" ")[1:]))[0] + var_ptm.append(("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site)) + else: + var_flag = 0 + var_ptm = ["[MS, MS:1002454, No variable modifications searched, ]"] + + return fix_ptm, var_ptm, fix_flag, var_flag + + +def mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages): + FragmentMassTolerance = opms["FragmentMassTolerance"].values[0] + FragmentMassToleranceUnit = opms["FragmentMassToleranceUnit"].values[0] + PrecursorMassTolerance = opms["PrecursorMassTolerance"].values[0] + PrecursorMassToleranceUnit = opms["PrecursorMassToleranceUnit"].values[0] + Enzyme = opms["Enzyme"].values[0] + FixedModifications = opms["FixedModifications"].values[0] + VariableModifications = opms["VariableModifications"].values[0] + + out_mztab_MTD = pd.DataFrame() + out_mztab_MTD.loc[1, "mzTab-version"] = "1.0.0" + out_mztab_MTD.loc[1, "mzTab-mode"] = "Summary" + out_mztab_MTD.loc[1, "mzTab-type"] = "Quantification" + out_mztab_MTD.loc[1, "title"] = "ConsensusMap export from OpenMS" + out_mztab_MTD.loc[1, "description"] = "OpenMS export from consensusXML" + out_mztab_MTD.loc[1, "protein_search_engine_score[1]"] = "[, , DIA-NN Global.PG.Q.Value, ]" + out_mztab_MTD.loc[1, "peptide_search_engine_score[1]"] = "[, , DIA-NN Q.Value (minimum of the respective precursor q-values), ]" + out_mztab_MTD.loc[1, "psm_search_engine_score[1]"] = "[MS, MS:MS:1001869, protein-level q-value, ]" + out_mztab_MTD.loc[1, "software[1]"] = "[MS, MS:1003253, DIA-NN, Release (v1.8.1)]" + out_mztab_MTD.loc[1, "software[1]-setting[1]"] = fasta + out_mztab_MTD.loc[1, "software[1]-setting[2]"] = "db_version:null" + out_mztab_MTD.loc[1, "software[1]-setting[3]"] = "fragment_mass_tolerance:" + FragmentMassTolerance + out_mztab_MTD.loc[1, "software[1]-setting[4]"] = "fragment_mass_tolerance_unit:" + FragmentMassToleranceUnit + out_mztab_MTD.loc[1, "software[1]-setting[5]"] = "precursor_mass_tolerance:" + PrecursorMassTolerance + out_mztab_MTD.loc[1, "software[1]-setting[6]"] = "precursor_mass_tolerance_unit:" + PrecursorMassToleranceUnit + out_mztab_MTD.loc[1, "software[1]-setting[7]"] = "enzyme:" + Enzyme + out_mztab_MTD.loc[1, "software[1]-setting[8]"] = "enzyme_term_specificity:full" + out_mztab_MTD.loc[1, "software[1]-setting[9]"] = "charges:" + str(charge) + out_mztab_MTD.loc[1, "software[1]-setting[10]"] = "missed_cleavages:" + str(missed_cleavages) + out_mztab_MTD.loc[1, "software[1]-setting[11]"] = "fixed_modifications:" + FixedModifications + out_mztab_MTD.loc[1, "software[1]-setting[12]"] = "variable_modifications:" + VariableModifications + + (fixed_mods, variable_mods, fix_flag, var_flag) = MTD_mod_info(unimod_data, FixedModifications, VariableModifications) + if fix_flag == 1: + for i in range(1, len(fixed_mods) + 1): + out_mztab_MTD.loc[1, "fixed_mod[" + str(i) + "]"] = fixed_mods[i - 1][0] + out_mztab_MTD.loc[1, "fixed_mod[" + str(i) + "]-site"] = fixed_mods[i - 1][1] + out_mztab_MTD.loc[1, "fixed_mod[" + str(i) + "]-position"] = "Anywhere" + else: + out_mztab_MTD.loc[1, "fixed_mod[1]"] = fixed_mods[0] + + if var_flag == 1: + for i in range(1, len(variable_mods) + 1): + out_mztab_MTD.loc[1, "variable_mod[" + str(i) + "]"] = variable_mods[i - 1][0] + out_mztab_MTD.loc[1, "variable_mod[" + str(i) + "]-site"] = variable_mods[i - 1][1] + out_mztab_MTD.loc[1, "variable_mod[" + str(i) + "]-position"] = "Anywhere" + else: + out_mztab_MTD.loc[1, "variable_mod[1]"] = variable_mods[0] + + out_mztab_MTD.loc[1, "quantification_method"] = "[MS, MS:1001834, LC-MS label-free quantitation analysis, ]" + out_mztab_MTD.loc[1, "protein-quantification_unit"] = "[, , Abundance, ]" + out_mztab_MTD.loc[1, "peptide-quantification_unit"] = "[, , Abundance, ]" + + for i in range(1, max(index_ref["ms_run"]) + 1): + out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-format"] = "[MS, MS:1000584, mzML file, ]" + out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-location"] = "file://" + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0] + out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-id_format"] = "[MS, MS:1000777, spectrum identifier nativeID format, ]" + out_mztab_MTD.loc[1, "assay[" + str(i) + "]-quantification_reagent"] = "[MS, MS:1002038, unlabeled sample, ]" + out_mztab_MTD.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = "ms_run[" + str(i) + "]" + + for i in range(1, max(index_ref["study_variable"]) + 1): + study_variable = [] + for j in list(index_ref[index_ref["study_variable"] == i]["ms_run"].values): + study_variable.append("assay[" + str(j) + "]") + out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-assay_refs"] = ",".join(study_variable) + out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-description"] = "no description given" + + out_mztab_MTD.loc[2, :] = "MTD" + col = list(out_mztab_MTD.columns) + row = list(out_mztab_MTD.index) + out_mztab_MTD_T = pd.DataFrame(out_mztab_MTD.values.T, index = col, columns = row) + out_mztab_MTD_T.columns = ["inf", "index"] + out_mztab_MTD_T.insert(0, "title", out_mztab_MTD_T.index) + index = out_mztab_MTD_T.loc[:, "index"] + out_mztab_MTD_T.drop(labels = ["index"], axis = 1, inplace = True) + out_mztab_MTD_T.insert(0, "index", index) + database = os.path.basename(fasta.split(".")[-2]) + + return out_mztab_MTD_T, database + + +def mztab_PRH(report, pg, unique, index_ref, database, fasta_pd): + file = list(pg.columns[5:]) + col = {} + for i in file: + col[i] = "protein_abundance_assay[" + str(index_ref[index_ref["run"] == i.split("/")[-1].split(".")[-2]]["ms_run"].values[0]) + "]" + + pg = pg.rename(columns = col) + out_mztab_PRH = pd.DataFrame() + out_mztab_PRH = pg.drop(["Protein.Group", "Protein.Names"], axis = 1) + out_mztab_PRH = out_mztab_PRH.rename(columns = {"Protein.Ids":"accession", "First.Protein.Description":"description"}) + out_mztab_PRH.loc[:, "PRH"] = "PRT" + index = out_mztab_PRH.loc[:, "PRH"] + out_mztab_PRH.drop(labels = ["PRH"], axis = 1, inplace = True) + out_mztab_PRH.insert(0, "PRH", index) + out_mztab_PRH.loc[:, "database"] = database + + null_col = ["taxid", "species", "database_version", "search_engine", "opt_global_Posterior_Probability_score", + "opt_global_nr_found_peptides", "opt_global_cv_PRIDE:0000303_decoy_hit"] + for i in null_col: + out_mztab_PRH.loc[:, i] = "null" + + out_mztab_PRH.loc[:, "protein_coverage"] = out_mztab_PRH.apply( + lambda x: calculate_protein_coverage(report, x["accession"], fasta_pd), axis=1, result_type="expand") + + out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.loc[:, "accession"] + + out_mztab_PRH[["modifiedSequence", "best_search_engine_score[1]"]] = out_mztab_PRH.apply( + lambda x: PRH_match_report(report, x["accession"]), axis=1, result_type="expand") + + out_mztab_PRH.loc[:, "modifications"] = out_mztab_PRH.apply( + lambda x: find_modification(x["modifiedSequence"]), axis=1, result_type="expand") + + ## quantity at protein level: PG.MaxLFQ + ## protein_abundance_study_variable[n] + max_study_variable = max(index_ref["study_variable"]) + PRH_params = [] + for i in range(1, max_study_variable + 1): + PRH_params.extend(["protein_abundance_study_variable[" + str(i) + "]", "protein_abundance_stdev_study_variable[" + str(i) + "]", + "protein_abundance_std_error_study_variable[" + str(i) + "]"]) + + out_mztab_PRH[PRH_params] = out_mztab_PRH.apply( + lambda x: match_in_report(report, x['accession'], max_study_variable, 0, 'protein'), axis=1, result_type='expand') + + out_mztab_PRH.loc[:, "opt_global_result_type"] = out_mztab_PRH.apply( + lambda x: classify_protein(x["Genes"], unique["Genes"], "single_protein", "indistinguishable_protein_group"), axis=1, result_type="expand") + + out_mztab_PRH = out_mztab_PRH.drop(["Genes", "modifiedSequence"], axis=1) + out_mztab_PRH.fillna("null", inplace=True) + out_mztab_PRH.to_csv("./out_protein.mztab", sep=",", index=False) + + return out_mztab_PRH + + +def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, database): + out_mztab_PEH = pd.DataFrame() + out_mztab_PEH = pr.iloc[:, 0:10] + out_mztab_PEH = out_mztab_PEH.drop(["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"], axis = 1) + out_mztab_PEH = out_mztab_PEH.rename(columns = {"Stripped.Sequence":"sequence", "Protein.Ids":"accession", + "Modified.Sequence":"opt_global_cv_MS:1000889_peptidoform_sequence", "Precursor.Charge":"charge"}) + out_mztab_PEH.loc[:, "PEH"] = "PEP" + index = out_mztab_PEH.loc[:, "PEH"] + out_mztab_PEH.drop(labels = ["PEH"], axis = 1, inplace = True) + out_mztab_PEH.insert(0, "PEH", index) + out_mztab_PEH.loc[:, "database"] = database + + out_mztab_PEH.loc[:, "modifications"] = out_mztab_PEH.apply( + lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), axis=1, result_type="expand") + + out_mztab_PEH.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = out_mztab_PEH.apply( + lambda x: convert_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"], unimod_data), axis=1) + + out_mztab_PEH.loc[:, "unique"] = out_mztab_PEH.apply(lambda x: classify_protein(x["Genes"], unique["Genes"], "1", "0"), axis=1, result_type="expand") + + null_col = ["database_version", "search_engine", "retention_time_window"] + for i in null_col: + out_mztab_PEH.loc[:, i] = "null" + out_mztab_PEH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" + + ## average value of each study_variable + ## quantity at peptide level: Precursor.Normalised + out_mztab_PEH.loc[:, 'pr_id'] = out_mztab_PEH.apply(lambda x: precursor_list.index(x["Precursor.Id"]), axis = 1, result_type = 'expand') + max_assay = max(index_ref['ms_run']) + max_study_variable = max(index_ref['study_variable']) + + ms_run_score = [] + for i in range(1, max_assay + 1): + ms_run_score.append('search_engine_score[1]_ms_run[' + str(i) + ']') + out_mztab_PEH[ms_run_score] = out_mztab_PEH.apply(lambda x: match_in_report(report, x['pr_id'], max_assay, 0, 'pep'), axis = 1, result_type = 'expand') + + PEH_params = [] + for i in range(1, max_study_variable + 1): + PEH_params.extend(['peptide_abundance_study_variable[' + str(i) + ']', 'peptide_abundance_stdev_study_variable[' + str(i) + ']', + 'peptide_abundance_std_error_study_variable[' + str(i) + ']', 'opt_global_mass_to_charge_study_variable[' + str(i) + ']', + 'opt_global_retention_time_study_variable[' + str(i) + ']']) + out_mztab_PEH[PEH_params] = out_mztab_PEH.apply( + lambda x: match_in_report(report, x['pr_id'], max_study_variable, 1, 'pep'), axis=1, result_type='expand') + + ## ["Q.Value", "RT", "Global.Q.Value", "Lib.Q.Value", "Precursor.Mz"] + out_mztab_PEH[["best_search_engine_score[1]", "retention_time", "opt_global_q-value", "opt_global_SpecEValue_score", "mass_to_charge" + ]] = out_mztab_PEH.apply(lambda x: PEH_match_report(report, x['pr_id']), axis=1, result_type="expand") + + out_mztab_PEH[["opt_global_feature_id", "spectra_ref"]] = out_mztab_PEH.apply(lambda x:("null", "null"),axis = 1, result_type = "expand") + out_mztab_PEH = out_mztab_PEH.drop(["Precursor.Id", "Genes", "pr_id"], axis = 1) + out_mztab_PEH.fillna("null", inplace = True) + out_mztab_PEH.to_csv("./out_peptide.mztab", sep=",", index=False) + + return out_mztab_PEH + + +def mztab_PSH(unique, unimod_data, report, database): + out_mztab_PSH = pd.DataFrame() + ## Score at PSM level: Q.Value + out_mztab_PSH = report[["Stripped.Sequence", "Protein.Ids", "Genes", "Q.Value", "RT", + "Precursor.Charge", "Precursor.Mz", "Modified.Sequence", "PEP", "Global.Q.Value", "Global.Q.Value"]] + out_mztab_PSH.columns = ["sequence", "accession", "Genes", "search_engine_score[1]", "retention_time", "charge", "calc_mass_to_charge", + "opt_global_cv_MS:1000889_peptidoform_sequence", "opt_global_SpecEValue_score", "opt_global_q-value", "opt_global_q-value_score"] + + out_mztab_PSH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" + out_mztab_PSH.loc[:, "PSH"] = "PSM" + index = out_mztab_PSH.loc[:, "PSH"] + out_mztab_PSH.drop(labels = ["PSH"], axis = 1, inplace = True) + out_mztab_PSH.insert(0, "PSH", index) + out_mztab_PSH.loc[:, "PSM_ID"] = out_mztab_PSH.index + out_mztab_PSH.loc[:, "unique"] = out_mztab_PSH.apply(lambda x: classify_protein(x["Genes"], unique["Genes"], "1", "0"), axis=1, result_type="expand") + out_mztab_PSH.loc[:, "database"] = database + + null_col = ["database_version", "spectra_ref", "search_engine", "exp_mass_to_charge", "pre", "post", + "start", "end", "opt_global_feature_id", "opt_global_map_index", "opt_global_spectrum_reference"] + for i in null_col: + out_mztab_PSH.loc[:, i] = "null" + + out_mztab_PSH.loc[:, "modifications"] = out_mztab_PSH.apply( + lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), axis=1, result_type="expand") + + out_mztab_PSH.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = out_mztab_PSH.apply( + lambda x: convert_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"], unimod_data), axis=1, result_type="expand") + + out_mztab_PSH = out_mztab_PSH.drop(["Genes"], axis = 1) + out_mztab_PSH.fillna("null", inplace = True) + out_mztab_PSH.to_csv("./out_psms.mztab", sep=",", index=False) + + return out_mztab_PSH + + +def add_info(target, index_ref): + match = index_ref[index_ref["run"] == target] + ms_run = match["ms_run"].values[0] + study_variable = match["study_variable"].values[0] + + return ms_run, study_variable + + +def classify_protein(target, unique, t, f): + if any(unique == target): + return t + else: + return f + + +def calculate_protein_coverage(report, target, fasta_pd): + protein_coverage = "" + len_current = len(max(report[report["Protein.Ids"] == target]["Stripped.Sequence"].values, key = len)) + for i in target.split(";"): + len_original = fasta_pd[fasta_pd["id"].str.contains(i)]["len"].values[0] + protein_coverage += format(len_current / len_original, ".3f") + ";" + + return protein_coverage.strip(";") + + +def match_in_report(report, target, run, flag, level): + if flag == 1 and level == 'pep': + result = report[report['precursor.Index'] == target] + PEH_params = [] + for i in range(1, run + 1): + match = result[result['study_variable'] == i] + PEH_params.extend([match['Precursor.Normalised'].mean(), 'null', 'null', 'null', match['RT'].mean()]) + + return tuple(PEH_params) + + elif flag == 0 and level == 'pep': + result = report[report['precursor.Index'] == target] + q_value = [] + for i in range(1, run + 1): + match = result[result['ms_run'] == i] + q_value.append(match['Q.Value'].values[0] if match['Q.Value'].values.size > 0 else np.nan) + + return tuple(q_value) + + elif flag == 0 and level == 'protein': + result = report[report['Protein.Ids'] == target] + PRH_params = [] + for i in range(1, run + 1): + match = result[result['study_variable'] == i] + PRH_params.extend([match['PG.MaxLFQ'].mean(), 'null', 'null']) + + return tuple(PRH_params) + + +def PRH_match_report(report, target): + match = report[report["Protein.Ids"] == target] + modSeq = match["Modified.Sequence"].values[0] if match["Modified.Sequence"].values.size > 0 else np.nan + ## Score at protein level: Global.PG.Q.Value (without MBR) + score = match["Global.PG.Q.Value"].min() + + return modSeq, score + + +def PEH_match_report(report, target): + match = report[report["precursor.Index"] == target] + ## Score at peptide level: the minimum of the respective precursor q-values (minimum of Q.Value per group) + search_score = match["Q.Value"].min() + time = match["RT"].mean() + q_score = match["Global.Q.Value"].values[0] if match["Global.Q.Value"].values.size > 0 else np.nan + spec_e = match["Lib.Q.Value"].values[0] if match["Lib.Q.Value"].values.size > 0 else np.nan + mz = match["Precursor.Mz"].mean() + + return search_score, time, q_score, spec_e, mz + + +def find_modification(peptide): + pattern = re.compile(r"\((.*?)\)") + original_mods = re.findall(pattern, peptide) + peptide = re.sub('\(.*?\)','.',peptide) + position = [i.start() for i in re.finditer("\.", peptide)] + for j in range(1, len(position)): + position[j] -= j + + for k in range(0,len(original_mods)): + original_mods[k] = str(position[k]) + "-" + original_mods[k].upper() + + original_mods = ",".join(str(i) for i in original_mods) + + return original_mods if len(original_mods) > 0 else "null" + cli.add_command(convert) diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index 5a5cbfaa..225e114c 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -12,10 +12,18 @@ process DIANNCONVERT { input: path(report) path(exp_design) + path(report_pg) + path(report_pr) + path(report_unique_gene) + path(openms) + path(fasta) + val(charge) + val(missed_cleavages) output: path "*msstats_in.csv", emit: out_msstats path "*triqler_in.tsv", emit: out_triqler + path "*out.mztab", emit: out_mztab path "versions.yml", emit: version script: @@ -25,8 +33,15 @@ process DIANNCONVERT { diann_convert.py convert \\ --diann_report ${report} \\ --exp_design ${exp_design} \\ + --pg_matrix ${report_pg} \\ + --pr_matrix ${report_pr} \\ + --unique_matrix ${report_unique_gene} \\ + --openms ${openms} \\ + --fasta ${fasta} \\ + --charge ${charge} \\ + --missed_cleavages ${missed_cleavages} \\ --qvalue_threshold $params.protein_level_fdr_cutoff \\ - |& tee trans_to_msstats.log + |& tee convert_report.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/diannsummary/main.nf b/modules/local/diannsummary/main.nf index a326e20d..2d24f88b 100644 --- a/modules/local/diannsummary/main.nf +++ b/modules/local/diannsummary/main.nf @@ -16,6 +16,7 @@ process DIANNSUMMARY { path "diann_report.pr_matrix.tsv", emit: pr_matrix path "diann_report.pg_matrix.tsv", emit: pg_matrix path "diann_report.gg_matrix.tsv", emit: gg_matrix + path "diann_report.unique_genes_matrix.tsv", emit: unique_gene_matrix path "diannsummary.log", emit: log path "versions.yml", emit: version diff --git a/subworkflows/local/create_input_channel.nf b/subworkflows/local/create_input_channel.nf index e875b12d..8269ee66 100644 --- a/subworkflows/local/create_input_channel.nf +++ b/subworkflows/local/create_input_channel.nf @@ -54,6 +54,7 @@ workflow CREATE_INPUT_CHANNEL { ch_meta_config_iso // [meta, [spectra_files ]] ch_meta_config_lfq // [meta, [spectra_files ]] ch_meta_config_dia // [meta, [spectra files ]] + ch_config ch_expdesign version = ch_versions diff --git a/workflows/dia.nf b/workflows/dia.nf index eda8fdaa..00cabbab 100644 --- a/workflows/dia.nf +++ b/workflows/dia.nf @@ -34,6 +34,7 @@ workflow DIA { take: ch_file_preparation_results ch_expdesign + ch_config main: @@ -85,7 +86,7 @@ workflow DIA { // // MODULE: DIANNCONVERT // - DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign) + DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, DIANNSUMMARY.out.unique_gene_matrix, ch_config, params.database, params.max_precursor_charge, params.allowed_missed_cleavages) ch_software_versions = ch_software_versions.mix(DIANNCONVERT.out.version.ifEmpty(null)) // diff --git a/workflows/quantms.nf b/workflows/quantms.nf index 0970cd88..529ba9b0 100644 --- a/workflows/quantms.nf +++ b/workflows/quantms.nf @@ -151,7 +151,7 @@ workflow QUANTMS { ch_msstats_in = ch_msstats_in.mix(LFQ.out.msstats_in) ch_versions = ch_versions.mix(LFQ.out.versions.ifEmpty(null)) - DIA(ch_fileprep_result.dia, CREATE_INPUT_CHANNEL.out.ch_expdesign) + DIA(ch_fileprep_result.dia, CREATE_INPUT_CHANNEL.out.ch_expdesign, CREATE_INPUT_CHANNEL.out.ch_config) ch_pipeline_results = ch_pipeline_results.mix(DIA.out.diann_report) ch_msstats_in = ch_msstats_in.mix(DIA.out.msstats_in) ch_versions = ch_versions.mix(DIA.out.versions.ifEmpty(null)) From abbd2c1b0727f969bb889b0c496c4e15883c16e8 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Sun, 3 Jul 2022 16:27:49 +0800 Subject: [PATCH 32/47] Update diann_convert.py --- bin/diann_convert.py | 246 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 216 insertions(+), 30 deletions(-) diff --git a/bin/diann_convert.py b/bin/diann_convert.py index 86e29191..472a573f 100755 --- a/bin/diann_convert.py +++ b/bin/diann_convert.py @@ -28,29 +28,29 @@ def cli(): @click.pass_context def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, openms, fasta, charge, missed_cleavages, qvalue_threshold): - """This function is designed to convert the DIA-NN output into three standard formats: MSstats, Triqler and mzTab, for quality control - and downstream analysis. + """This function is designed to convert the DIA-NN output into three standard formats: MSstats, Triqler and mzTab. These documents are + used for quality control and downstream analysis. :param diann_report: Path to the main report output by DIA-NN - :type diann_report: str, optional + :type diann_report: str :param exp_design: Path to the experimental design file - :type exp_design: str, optional + :type exp_design: str :param pg_matrix: Path to a DIA-NN matrix file containing protein groups - :type pg_matrix: str, optional + :type pg_matrix: str :param pr_matrix: Path to a DIA-NN matrix file containing precursors - :type pr_matrix: str, optional + :type pr_matrix: str :param unique_matrix: Path to a DIA-NN matrix file containing unique genes - :type unique_matrix: str, optional + :type unique_matrix: str :param openms: Path to OpenMS, one of the workflow configuration files - :type openms: str, optional + :type openms: str :param fasta: Path to the fasta file - :type fasta: str, optional + :type fasta: str :param charge: The charge assigned by DIA-NN(max_precursor_charge) - :type charge: int, optional + :type charge: int :param missed_cleavages: Allowed missed cleavages assigned by DIA-NN - :type missed_cleavages: int, optional + :type missed_cleavages: int :param qvalue_threshold: Threshold for filtering q value - :type qvalue_threshold: float, optional + :type qvalue_threshold: float """ pg = pd.read_csv(pg_matrix, sep = "\t", header = 0, dtype = "str") @@ -59,7 +59,7 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, opms = pd.read_csv(openms, sep = "\t", header = 0, dtype = "str") opms.fillna("null", inplace=True) report = pd.read_csv(diann_report, sep = "\t", header = 0, dtype = "str") - report["Precursor.Mz"] = report.apply(lambda x: molecular_weight(x["Stripped.Sequence"], 'protein', monoisotopic=True) / int(x["Precursor.Charge"]), axis=1) + report["Calculate.Precursor.Mz"] = report.apply(lambda x: molecular_weight(x["Stripped.Sequence"], 'protein', monoisotopic=True) / int(x["Precursor.Charge"]), axis=1) precursor_list = list(report["Precursor.Id"].unique()) report["precursor.Index"] = report.apply(lambda x: precursor_list.index(x["Precursor.Id"]), axis=1) @@ -140,6 +140,19 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, def query_expdesign_value(reference, f_table, s_table): + """By matching the "Run" column in f_table or the "Sample" column in s_table, this function returns a tuple containing Fraction, + BioReplicate and Condition. + + :param reference: The "Run" column in out_msstats + :type reference: pandas.core.series.Series + :param f_table: A table contains experiment settings(search engine settings etc.) + :type f_table: pandas.core.frame.DataFrame + :param s_table: A table contains experimental design + :type s_table: pandas.core.frame.DataFrame + :return: A tuple contains Fraction, BioReplicate and Condition + :rtype: tuple + """ + query_reference = f_table[f_table["run"] == reference] Fraction = query_reference["Fraction"].values[0] row = s_table[s_table["Sample"] == query_reference["Sample"].values[0]] @@ -150,6 +163,16 @@ def query_expdesign_value(reference, f_table, s_table): def convert_modification(peptide, unimod_data): + """Convert the accession in the peptide to name("Unimod:35" to "Oxidation" etc.). + + :param peptide: Peptide contains the modification accessions + :type peptide: str + :param unimod_data: An instance of class UnimodDatabase + :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase + :return: Peptide contains the modification names + :rtype: str + """ + pattern = re.compile(r"\((.*?)\)") origianl_mods = re.findall(pattern, peptide) for mod in set(origianl_mods): @@ -159,7 +182,20 @@ def convert_modification(peptide, unimod_data): peptide = peptide + "." return peptide + def MTD_mod_info(unimod_database, fix_mod, var_mod): + """Convert fixed and variable modifications to the format required by the MTD sub-table. + + :param unimod_database: An instance of class UnimodDatabase + :type unimod_database: sdrf_pipelines.openms.unimod.UnimodDatabase + :param fix_mod: Fixed modifications from openms.tsv + :type fix_mod: str + :param var_mod: Variable modifications from openms.tsv + :type var_mod: str + :return: A tuple contains fixed and variable modifications, and flags indicating whether they are null + :rtype: tuple + """ + pattern = re.compile("\((.*?)\)") var_ptm = [] fix_ptm = [] @@ -176,7 +212,7 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod): fix_ptm.append(("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site)) else: fix_flag = 0 - fix_ptm = ["[MS, MS:1002453, No fixed modifications searched, ]"] + fix_ptm.append("[MS, MS:1002453, No fixed modifications searched, ]") if var_mod != "null": var_flag = 1 @@ -190,12 +226,30 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod): var_ptm.append(("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site)) else: var_flag = 0 - var_ptm = ["[MS, MS:1002454, No variable modifications searched, ]"] + var_ptm.append("[MS, MS:1002454, No variable modifications searched, ]") return fix_ptm, var_ptm, fix_flag, var_flag def mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages): + """Construct MTD sub-table. + + :param index_ref: On the basis of f_table, two columns "MS_run" and "study_variable" are added for matching + :type indx_ref: pandas.core.frame.DataFrame + :param opms: Dataframe for openms.tsv + :type opms: pandas.core.frame.DataFrame + :param unimod_data: An instance of class UnimodDatabase + :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase + :param fasta: Fasta file path + :type fasta: str + :param charge: Charges set by Dia-NN + :type charge: int + :param missed_cleavages: Missed cleavages set by Dia-NN + :type missed_cleavages: int + :return: MTD sub-table + :rtype: pandas.core.frame.DataFrame + """ + FragmentMassTolerance = opms["FragmentMassTolerance"].values[0] FragmentMassToleranceUnit = opms["FragmentMassToleranceUnit"].values[0] PrecursorMassTolerance = opms["PrecursorMassTolerance"].values[0] @@ -263,6 +317,8 @@ def mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages): out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-description"] = "no description given" out_mztab_MTD.loc[2, :] = "MTD" + + # Transpose out_mztab_MTD col = list(out_mztab_MTD.columns) row = list(out_mztab_MTD.index) out_mztab_MTD_T = pd.DataFrame(out_mztab_MTD.values.T, index = col, columns = row) @@ -277,6 +333,24 @@ def mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages): def mztab_PRH(report, pg, unique, index_ref, database, fasta_pd): + """Construct PRH sub-table. + + :param report: Dataframe for Dia-NN main report + :type report: pandas.core.frame.DataFrame + :param pg: Dataframe for Dia-NN protein groups matrix + :type pg: pandas.core.frame.DataFrame + :param unique: Dataframe for Dia-NN unique genes matrix + :type unique: pandas.core.frame.DataFrame + :param index_ref: On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching + :type indx_ref: pandas.core.frame.DataFrame + :param database: Path to fasta file + :type database: str + :param fasta_pd: A dataframe contains protein IDs, sequences and lengths + :type fasta_pd: pandas.core.frame.DataFrame + :return: PRH sub-table + :rtype: pandas.core.frame.DataFrame + """ + file = list(pg.columns[5:]) col = {} for i in file: @@ -309,7 +383,6 @@ def mztab_PRH(report, pg, unique, index_ref, database, fasta_pd): lambda x: find_modification(x["modifiedSequence"]), axis=1, result_type="expand") ## quantity at protein level: PG.MaxLFQ - ## protein_abundance_study_variable[n] max_study_variable = max(index_ref["study_variable"]) PRH_params = [] for i in range(1, max_study_variable + 1): @@ -317,19 +390,39 @@ def mztab_PRH(report, pg, unique, index_ref, database, fasta_pd): "protein_abundance_std_error_study_variable[" + str(i) + "]"]) out_mztab_PRH[PRH_params] = out_mztab_PRH.apply( - lambda x: match_in_report(report, x['accession'], max_study_variable, 0, 'protein'), axis=1, result_type='expand') + lambda x: match_in_report(report, x['accession'], max_study_variable, 1, 'protein'), axis=1, result_type='expand') out_mztab_PRH.loc[:, "opt_global_result_type"] = out_mztab_PRH.apply( lambda x: classify_protein(x["Genes"], unique["Genes"], "single_protein", "indistinguishable_protein_group"), axis=1, result_type="expand") out_mztab_PRH = out_mztab_PRH.drop(["Genes", "modifiedSequence"], axis=1) out_mztab_PRH.fillna("null", inplace=True) - out_mztab_PRH.to_csv("./out_protein.mztab", sep=",", index=False) + # out_mztab_PRH.to_csv("./out_protein.mztab", sep=",", index=False) return out_mztab_PRH def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, database): + """Construct PEH sub-table. + + :param report: Dataframe for Dia-NN main report + :type report: pandas.core.frame.DataFrame + :param pr: Dataframe for Dia-NN precursors matrix + :type pr: pandas.core.frame.DataFrame + :param unique: Dataframe for Dia-NN unique genes matrix + :type unique: pandas.core.frame.DataFrame + :param unimod_data: An instance of class UnimodDatabase + :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase + :param precursor_list: A list contains all precursor IDs + :type precursor_list: list + :param index_ref: On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching + :type indx_ref: pandas.core.frame.DataFrame + :param database: Path to fasta file + :type database: str + :return: PEH sub-table + :rtype: pandas.core.frame.DataFrame + """ + out_mztab_PEH = pd.DataFrame() out_mztab_PEH = pr.iloc[:, 0:10] out_mztab_PEH = out_mztab_PEH.drop(["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"], axis = 1) @@ -373,23 +466,36 @@ def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, databa out_mztab_PEH[PEH_params] = out_mztab_PEH.apply( lambda x: match_in_report(report, x['pr_id'], max_study_variable, 1, 'pep'), axis=1, result_type='expand') - ## ["Q.Value", "RT", "Global.Q.Value", "Lib.Q.Value", "Precursor.Mz"] out_mztab_PEH[["best_search_engine_score[1]", "retention_time", "opt_global_q-value", "opt_global_SpecEValue_score", "mass_to_charge" ]] = out_mztab_PEH.apply(lambda x: PEH_match_report(report, x['pr_id']), axis=1, result_type="expand") out_mztab_PEH[["opt_global_feature_id", "spectra_ref"]] = out_mztab_PEH.apply(lambda x:("null", "null"),axis = 1, result_type = "expand") out_mztab_PEH = out_mztab_PEH.drop(["Precursor.Id", "Genes", "pr_id"], axis = 1) out_mztab_PEH.fillna("null", inplace = True) - out_mztab_PEH.to_csv("./out_peptide.mztab", sep=",", index=False) + # out_mztab_PEH.to_csv("./out_peptide.mztab", sep=",", index=False) return out_mztab_PEH def mztab_PSH(unique, unimod_data, report, database): + """Construct PSH sub-table. + + :param unique: Dataframe for Dia-NN unique genes matrix + :type unique: pandas.core.frame.DataFrame + :param unimod_data: An instance of class UnimodDatabase + :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase + :param report: Dataframe for Dia-NN main report + :type report: pandas.core.frame.DataFrame + :param database: Path to fasta file + :type database: str + :return: PSH sub-table + :rtype: pandas.core.frame.DataFrame + """ + out_mztab_PSH = pd.DataFrame() ## Score at PSM level: Q.Value out_mztab_PSH = report[["Stripped.Sequence", "Protein.Ids", "Genes", "Q.Value", "RT", - "Precursor.Charge", "Precursor.Mz", "Modified.Sequence", "PEP", "Global.Q.Value", "Global.Q.Value"]] + "Precursor.Charge", "Calculate.Precursor.Mz", "Modified.Sequence", "PEP", "Global.Q.Value", "Global.Q.Value"]] out_mztab_PSH.columns = ["sequence", "accession", "Genes", "search_engine_score[1]", "retention_time", "charge", "calc_mass_to_charge", "opt_global_cv_MS:1000889_peptidoform_sequence", "opt_global_SpecEValue_score", "opt_global_q-value", "opt_global_q-value_score"] @@ -415,12 +521,22 @@ def mztab_PSH(unique, unimod_data, report, database): out_mztab_PSH = out_mztab_PSH.drop(["Genes"], axis = 1) out_mztab_PSH.fillna("null", inplace = True) - out_mztab_PSH.to_csv("./out_psms.mztab", sep=",", index=False) + # out_mztab_PSH.to_csv("./out_psms.mztab", sep=",", index=False) return out_mztab_PSH def add_info(target, index_ref): + """On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching. + + :param target: The "Run" column in f_table + :type target: pandas.core.series.Series + :param index_ref: A dataframe on the basis of f_table + :type indx_ref: pandas.core.frame.DataFrame + :return: A tuple contains ms_run and study_variable + :rtype: tuple + """ + match = index_ref[index_ref["run"] == target] ms_run = match["ms_run"].values[0] study_variable = match["study_variable"].values[0] @@ -429,6 +545,20 @@ def add_info(target, index_ref): def classify_protein(target, unique, t, f): + """Proteins are classified according to whether the corresponding gene is unique or not. + + :param target: The "Genes" column in out_mztab_PSH + :type target: pandas.core.series.Series + :param unique: The "Genes" column in unique(Dataframe for Dia-NN unique genes matrix) + :type unique: pandas.core.series.Series + :param t: The signature of the unique gene + :type t: str + :param f: The signature of the NOT unique gene + :type f: str + :return: The signature of genes + :rtype: str + """ + if any(unique == target): return t else: @@ -436,6 +566,18 @@ def classify_protein(target, unique, t, f): def calculate_protein_coverage(report, target, fasta_pd): + """Calculate protein coverage. + + :param report: Dataframe for Dia-NN main report + :type report: pandas.core.frame.DataFrame + :param target: The "accession" column in out_mztab_PRH + :type target: pandas.core.series.Series + :param fasta_pd: A dataframe contains protein IDs, sequences and lengths + :type fasta_pd: pandas.core.frame.DataFrame + :return: Protein coverage + :rtype: str + """ + protein_coverage = "" len_current = len(max(report[report["Protein.Ids"] == target]["Stripped.Sequence"].values, key = len)) for i in target.split(";"): @@ -445,11 +587,27 @@ def calculate_protein_coverage(report, target, fasta_pd): return protein_coverage.strip(";") -def match_in_report(report, target, run, flag, level): +def match_in_report(report, target, max, flag, level): + """This function is used to match the columns "ms_run" and "study_variable" in the report to get the information. + + :param report: Dataframe for Dia-NN main report + :type report: pandas.core.frame.DataFrame + :param target: The "pr_id" column in out_mztab_PEH(level="peptide") or the "accession" column in out_mztab_PRH(level="protein") + :type target: pandas.core.series.Series + :param max: max_assay or max_study_variable + :type max: int + :param flag: Match the "study_variable" column(flag=1) or the "ms_run" column(flag=0) in the filter result + :type flag: int + :param level: "pep" or "protein" + :type level: str + :return: A tuple contains multiple messages + :rtype: tuple + """ + if flag == 1 and level == 'pep': result = report[report['precursor.Index'] == target] PEH_params = [] - for i in range(1, run + 1): + for i in range(1, max + 1): match = result[result['study_variable'] == i] PEH_params.extend([match['Precursor.Normalised'].mean(), 'null', 'null', 'null', match['RT'].mean()]) @@ -458,16 +616,16 @@ def match_in_report(report, target, run, flag, level): elif flag == 0 and level == 'pep': result = report[report['precursor.Index'] == target] q_value = [] - for i in range(1, run + 1): + for i in range(1, max + 1): match = result[result['ms_run'] == i] q_value.append(match['Q.Value'].values[0] if match['Q.Value'].values.size > 0 else np.nan) return tuple(q_value) - elif flag == 0 and level == 'protein': + elif flag == 1 and level == 'protein': result = report[report['Protein.Ids'] == target] PRH_params = [] - for i in range(1, run + 1): + for i in range(1, max + 1): match = result[result['study_variable'] == i] PRH_params.extend([match['PG.MaxLFQ'].mean(), 'null', 'null']) @@ -475,6 +633,16 @@ def match_in_report(report, target, run, flag, level): def PRH_match_report(report, target): + """Returns a tuple contains modified sequences and the score at protein level. + + :param report: Dataframe for Dia-NN main report + :type report: pandas.core.frame.DataFrame + :param target: The "accession" column in report + :type target: pandas.core.series.Series + :return: A tuple contains multiple information to construct PRH sub-table + :rtype: tuple + """ + match = report[report["Protein.Ids"] == target] modSeq = match["Modified.Sequence"].values[0] if match["Modified.Sequence"].values.size > 0 else np.nan ## Score at protein level: Global.PG.Q.Value (without MBR) @@ -484,18 +652,36 @@ def PRH_match_report(report, target): def PEH_match_report(report, target): + """Returns a tuple contains the score at peptide level, retain time, q_score, spec_e and mz. + + :param report: Dataframe for Dia-NN main report + :type report: pandas.core.frame.DataFrame + :param target: The "pr_id" column in report + :type target: pandas.core.series.Series + :return: A tuple contains multiple information to construct PEH sub-table + :rtype: tuple + """ + match = report[report["precursor.Index"] == target] ## Score at peptide level: the minimum of the respective precursor q-values (minimum of Q.Value per group) search_score = match["Q.Value"].min() time = match["RT"].mean() q_score = match["Global.Q.Value"].values[0] if match["Global.Q.Value"].values.size > 0 else np.nan spec_e = match["Lib.Q.Value"].values[0] if match["Lib.Q.Value"].values.size > 0 else np.nan - mz = match["Precursor.Mz"].mean() + mz = match["Calculate.Precursor.Mz"].mean() return search_score, time, q_score, spec_e, mz def find_modification(peptide): + """Identify the modification site based on the peptide containing modifications. + + :param peptide: Sequences of peptides + :type peptide: str + :return: Modification sites + :rtype: str + """ + pattern = re.compile(r"\((.*?)\)") original_mods = re.findall(pattern, peptide) peptide = re.sub('\(.*?\)','.',peptide) @@ -506,9 +692,9 @@ def find_modification(peptide): for k in range(0,len(original_mods)): original_mods[k] = str(position[k]) + "-" + original_mods[k].upper() - original_mods = ",".join(str(i) for i in original_mods) + original_mods = ",".join(str(i) for i in original_mods) if len(original_mods) > 0 else "null" - return original_mods if len(original_mods) > 0 else "null" + return original_mods cli.add_command(convert) From 430ae7968764480f02b2ae5de7a22fb65617fc3f Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Mon, 4 Jul 2022 17:02:40 +0800 Subject: [PATCH 33/47] abandon "openms.tsv" --- bin/diann_convert.py | 53 ++++++++-------------- modules/local/diannconvert/main.nf | 22 +++++---- subworkflows/local/create_input_channel.nf | 2 +- workflows/dia.nf | 4 +- workflows/quantms.nf | 2 +- 5 files changed, 34 insertions(+), 49 deletions(-) diff --git a/bin/diann_convert.py b/bin/diann_convert.py index 472a573f..2509f669 100755 --- a/bin/diann_convert.py +++ b/bin/diann_convert.py @@ -20,14 +20,14 @@ def cli(): @click.option("--pg_matrix", "-pg") @click.option("--pr_matrix", "-pr") @click.option("--unique_matrix", "-un") -@click.option("--openms", "-o") +@click.option("--dia_params", "-p") @click.option("--fasta", "-f") @click.option("--charge", "-c") @click.option("--missed_cleavages", "-m") @click.option("--qvalue_threshold", "-q", type=float) @click.pass_context -def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, openms, fasta, charge, missed_cleavages, qvalue_threshold): +def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, dia_params, fasta, charge, missed_cleavages, qvalue_threshold): """This function is designed to convert the DIA-NN output into three standard formats: MSstats, Triqler and mzTab. These documents are used for quality control and downstream analysis. @@ -41,8 +41,8 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, :type pr_matrix: str :param unique_matrix: Path to a DIA-NN matrix file containing unique genes :type unique_matrix: str - :param openms: Path to OpenMS, one of the workflow configuration files - :type openms: str + :param dia_params: A list contains DIA parameters + :type dia_params: list :param fasta: Path to the fasta file :type fasta: str :param charge: The charge assigned by DIA-NN(max_precursor_charge) @@ -52,12 +52,9 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, :param qvalue_threshold: Threshold for filtering q value :type qvalue_threshold: float """ - pg = pd.read_csv(pg_matrix, sep = "\t", header = 0, dtype = "str") pr = pd.read_csv(pr_matrix, sep = "\t", header = 0, dtype = "str") unique = pd.read_csv(unique_matrix, sep = "\t", header = 0, dtype = "str") - opms = pd.read_csv(openms, sep = "\t", header = 0, dtype = "str") - opms.fillna("null", inplace=True) report = pd.read_csv(diann_report, sep = "\t", header = 0, dtype = "str") report["Calculate.Precursor.Mz"] = report.apply(lambda x: molecular_weight(x["Stripped.Sequence"], 'protein', monoisotopic=True) / int(x["Precursor.Charge"]), axis=1) precursor_list = list(report["Precursor.Id"].unique()) @@ -125,7 +122,7 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, index_ref.loc[:, "study_variable"] = index_ref.loc[:, "study_variable"].astype("int") report[["ms_run", "study_variable"]] = report.apply(lambda x: add_info(x["Run"], index_ref), axis = 1, result_type = "expand") - (MTD, database) = mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages) + (MTD, database) = mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavages) PRH = mztab_PRH(report, pg, unique, index_ref, database, fasta_pd) PEH = mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, database) PSH = mztab_PSH(unique, unimod_data, report, database) @@ -152,7 +149,6 @@ def query_expdesign_value(reference, f_table, s_table): :return: A tuple contains Fraction, BioReplicate and Condition :rtype: tuple """ - query_reference = f_table[f_table["run"] == reference] Fraction = query_reference["Fraction"].values[0] row = s_table[s_table["Sample"] == query_reference["Sample"].values[0]] @@ -172,7 +168,6 @@ def convert_modification(peptide, unimod_data): :return: Peptide contains the modification names :rtype: str """ - pattern = re.compile(r"\((.*?)\)") origianl_mods = re.findall(pattern, peptide) for mod in set(origianl_mods): @@ -188,14 +183,13 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod): :param unimod_database: An instance of class UnimodDatabase :type unimod_database: sdrf_pipelines.openms.unimod.UnimodDatabase - :param fix_mod: Fixed modifications from openms.tsv + :param fix_mod: Fixed modifications from DIA parameter list :type fix_mod: str - :param var_mod: Variable modifications from openms.tsv + :param var_mod: Variable modifications from DIA parameter list :type var_mod: str :return: A tuple contains fixed and variable modifications, and flags indicating whether they are null :rtype: tuple """ - pattern = re.compile("\((.*?)\)") var_ptm = [] fix_ptm = [] @@ -231,13 +225,13 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod): return fix_ptm, var_ptm, fix_flag, var_flag -def mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages): +def mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavages): """Construct MTD sub-table. :param index_ref: On the basis of f_table, two columns "MS_run" and "study_variable" are added for matching :type indx_ref: pandas.core.frame.DataFrame - :param opms: Dataframe for openms.tsv - :type opms: pandas.core.frame.DataFrame + :param dia_params: A list contains DIA parameters + :type dia_params: list :param unimod_data: An instance of class UnimodDatabase :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase :param fasta: Fasta file path @@ -249,15 +243,14 @@ def mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages): :return: MTD sub-table :rtype: pandas.core.frame.DataFrame """ - - FragmentMassTolerance = opms["FragmentMassTolerance"].values[0] - FragmentMassToleranceUnit = opms["FragmentMassToleranceUnit"].values[0] - PrecursorMassTolerance = opms["PrecursorMassTolerance"].values[0] - PrecursorMassToleranceUnit = opms["PrecursorMassToleranceUnit"].values[0] - Enzyme = opms["Enzyme"].values[0] - FixedModifications = opms["FixedModifications"].values[0] - VariableModifications = opms["VariableModifications"].values[0] - + dia_params_list = dia_params.split(";") + FragmentMassTolerance = dia_params_list[0] + FragmentMassToleranceUnit = dia_params_list[1] + PrecursorMassTolerance = dia_params_list[2] + PrecursorMassToleranceUnit = dia_params_list[3] + Enzyme = dia_params_list[4] + FixedModifications = dia_params_list[5] + VariableModifications = dia_params_list[6] out_mztab_MTD = pd.DataFrame() out_mztab_MTD.loc[1, "mzTab-version"] = "1.0.0" out_mztab_MTD.loc[1, "mzTab-mode"] = "Summary" @@ -350,7 +343,6 @@ def mztab_PRH(report, pg, unique, index_ref, database, fasta_pd): :return: PRH sub-table :rtype: pandas.core.frame.DataFrame """ - file = list(pg.columns[5:]) col = {} for i in file: @@ -422,7 +414,6 @@ def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, databa :return: PEH sub-table :rtype: pandas.core.frame.DataFrame """ - out_mztab_PEH = pd.DataFrame() out_mztab_PEH = pr.iloc[:, 0:10] out_mztab_PEH = out_mztab_PEH.drop(["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"], axis = 1) @@ -491,7 +482,6 @@ def mztab_PSH(unique, unimod_data, report, database): :return: PSH sub-table :rtype: pandas.core.frame.DataFrame """ - out_mztab_PSH = pd.DataFrame() ## Score at PSM level: Q.Value out_mztab_PSH = report[["Stripped.Sequence", "Protein.Ids", "Genes", "Q.Value", "RT", @@ -536,7 +526,6 @@ def add_info(target, index_ref): :return: A tuple contains ms_run and study_variable :rtype: tuple """ - match = index_ref[index_ref["run"] == target] ms_run = match["ms_run"].values[0] study_variable = match["study_variable"].values[0] @@ -558,7 +547,6 @@ def classify_protein(target, unique, t, f): :return: The signature of genes :rtype: str """ - if any(unique == target): return t else: @@ -577,7 +565,6 @@ def calculate_protein_coverage(report, target, fasta_pd): :return: Protein coverage :rtype: str """ - protein_coverage = "" len_current = len(max(report[report["Protein.Ids"] == target]["Stripped.Sequence"].values, key = len)) for i in target.split(";"): @@ -603,7 +590,6 @@ def match_in_report(report, target, max, flag, level): :return: A tuple contains multiple messages :rtype: tuple """ - if flag == 1 and level == 'pep': result = report[report['precursor.Index'] == target] PEH_params = [] @@ -642,7 +628,6 @@ def PRH_match_report(report, target): :return: A tuple contains multiple information to construct PRH sub-table :rtype: tuple """ - match = report[report["Protein.Ids"] == target] modSeq = match["Modified.Sequence"].values[0] if match["Modified.Sequence"].values.size > 0 else np.nan ## Score at protein level: Global.PG.Q.Value (without MBR) @@ -661,7 +646,6 @@ def PEH_match_report(report, target): :return: A tuple contains multiple information to construct PEH sub-table :rtype: tuple """ - match = report[report["precursor.Index"] == target] ## Score at peptide level: the minimum of the respective precursor q-values (minimum of Q.Value per group) search_score = match["Q.Value"].min() @@ -681,7 +665,6 @@ def find_modification(peptide): :return: Modification sites :rtype: str """ - pattern = re.compile(r"\((.*?)\)") original_mods = re.findall(pattern, peptide) peptide = re.sub('\(.*?\)','.',peptide) diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index 225e114c..4b8406ed 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -15,7 +15,7 @@ process DIANNCONVERT { path(report_pg) path(report_pr) path(report_unique_gene) - path(openms) + val(meta) path(fasta) val(charge) val(missed_cleavages) @@ -23,24 +23,26 @@ process DIANNCONVERT { output: path "*msstats_in.csv", emit: out_msstats path "*triqler_in.tsv", emit: out_triqler - path "*out.mztab", emit: out_mztab + path "*.mztab", emit: out_mztab path "versions.yml", emit: version script: def args = task.ext.args ?: '' + def dia_params = [meta.fragmentmasstolerance, meta.fragmentmasstoleranceunit, meta.precursormasstolerance, + meta.precursormasstoleranceunit, meta.enzyme, meta.fixedmodifications, meta.variablemodifications].join(';') """ diann_convert.py convert \\ - --diann_report ${report} \\ - --exp_design ${exp_design} \\ - --pg_matrix ${report_pg} \\ - --pr_matrix ${report_pr} \\ - --unique_matrix ${report_unique_gene} \\ - --openms ${openms} \\ - --fasta ${fasta} \\ + --diann_report "${report}" \\ + --exp_design "${exp_design}" \\ + --pg_matrix "${report_pg}" \\ + --pr_matrix "${report_pr}" \\ + --unique_matrix "${report_unique_gene}" \\ + --dia_params "${dia_params}" \\ + --fasta "${fasta}" \\ --charge ${charge} \\ --missed_cleavages ${missed_cleavages} \\ - --qvalue_threshold $params.protein_level_fdr_cutoff \\ + --qvalue_threshold ${params.protein_level_fdr_cutoff} \\ |& tee convert_report.log cat <<-END_VERSIONS > versions.yml diff --git a/subworkflows/local/create_input_channel.nf b/subworkflows/local/create_input_channel.nf index 8269ee66..e0d624dd 100644 --- a/subworkflows/local/create_input_channel.nf +++ b/subworkflows/local/create_input_channel.nf @@ -54,7 +54,7 @@ workflow CREATE_INPUT_CHANNEL { ch_meta_config_iso // [meta, [spectra_files ]] ch_meta_config_lfq // [meta, [spectra_files ]] ch_meta_config_dia // [meta, [spectra files ]] - ch_config + ch_expdesign version = ch_versions diff --git a/workflows/dia.nf b/workflows/dia.nf index 00cabbab..a290d4f9 100644 --- a/workflows/dia.nf +++ b/workflows/dia.nf @@ -34,7 +34,6 @@ workflow DIA { take: ch_file_preparation_results ch_expdesign - ch_config main: @@ -86,7 +85,8 @@ workflow DIA { // // MODULE: DIANNCONVERT // - DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, DIANNSUMMARY.out.unique_gene_matrix, ch_config, params.database, params.max_precursor_charge, params.allowed_missed_cleavages) + DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, DIANNSUMMARY.out.unique_gene_matrix, + ch_result.meta.first(), params.database, params.max_precursor_charge, params.allowed_missed_cleavages) ch_software_versions = ch_software_versions.mix(DIANNCONVERT.out.version.ifEmpty(null)) // diff --git a/workflows/quantms.nf b/workflows/quantms.nf index 529ba9b0..0970cd88 100644 --- a/workflows/quantms.nf +++ b/workflows/quantms.nf @@ -151,7 +151,7 @@ workflow QUANTMS { ch_msstats_in = ch_msstats_in.mix(LFQ.out.msstats_in) ch_versions = ch_versions.mix(LFQ.out.versions.ifEmpty(null)) - DIA(ch_fileprep_result.dia, CREATE_INPUT_CHANNEL.out.ch_expdesign, CREATE_INPUT_CHANNEL.out.ch_config) + DIA(ch_fileprep_result.dia, CREATE_INPUT_CHANNEL.out.ch_expdesign) ch_pipeline_results = ch_pipeline_results.mix(DIA.out.diann_report) ch_msstats_in = ch_msstats_in.mix(DIA.out.msstats_in) ch_versions = ch_versions.mix(DIA.out.versions.ifEmpty(null)) From 7a76203061357f6117b55032ade09b947cb726bb Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Mon, 4 Jul 2022 17:10:09 +0800 Subject: [PATCH 34/47] Revert "abandon "openms.tsv"" This reverts commit 430ae7968764480f02b2ae5de7a22fb65617fc3f. --- bin/diann_convert.py | 53 ++++++++++++++-------- modules/local/diannconvert/main.nf | 22 ++++----- subworkflows/local/create_input_channel.nf | 2 +- workflows/dia.nf | 4 +- workflows/quantms.nf | 2 +- 5 files changed, 49 insertions(+), 34 deletions(-) diff --git a/bin/diann_convert.py b/bin/diann_convert.py index 2509f669..472a573f 100755 --- a/bin/diann_convert.py +++ b/bin/diann_convert.py @@ -20,14 +20,14 @@ def cli(): @click.option("--pg_matrix", "-pg") @click.option("--pr_matrix", "-pr") @click.option("--unique_matrix", "-un") -@click.option("--dia_params", "-p") +@click.option("--openms", "-o") @click.option("--fasta", "-f") @click.option("--charge", "-c") @click.option("--missed_cleavages", "-m") @click.option("--qvalue_threshold", "-q", type=float) @click.pass_context -def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, dia_params, fasta, charge, missed_cleavages, qvalue_threshold): +def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, openms, fasta, charge, missed_cleavages, qvalue_threshold): """This function is designed to convert the DIA-NN output into three standard formats: MSstats, Triqler and mzTab. These documents are used for quality control and downstream analysis. @@ -41,8 +41,8 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, :type pr_matrix: str :param unique_matrix: Path to a DIA-NN matrix file containing unique genes :type unique_matrix: str - :param dia_params: A list contains DIA parameters - :type dia_params: list + :param openms: Path to OpenMS, one of the workflow configuration files + :type openms: str :param fasta: Path to the fasta file :type fasta: str :param charge: The charge assigned by DIA-NN(max_precursor_charge) @@ -52,9 +52,12 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, :param qvalue_threshold: Threshold for filtering q value :type qvalue_threshold: float """ + pg = pd.read_csv(pg_matrix, sep = "\t", header = 0, dtype = "str") pr = pd.read_csv(pr_matrix, sep = "\t", header = 0, dtype = "str") unique = pd.read_csv(unique_matrix, sep = "\t", header = 0, dtype = "str") + opms = pd.read_csv(openms, sep = "\t", header = 0, dtype = "str") + opms.fillna("null", inplace=True) report = pd.read_csv(diann_report, sep = "\t", header = 0, dtype = "str") report["Calculate.Precursor.Mz"] = report.apply(lambda x: molecular_weight(x["Stripped.Sequence"], 'protein', monoisotopic=True) / int(x["Precursor.Charge"]), axis=1) precursor_list = list(report["Precursor.Id"].unique()) @@ -122,7 +125,7 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, index_ref.loc[:, "study_variable"] = index_ref.loc[:, "study_variable"].astype("int") report[["ms_run", "study_variable"]] = report.apply(lambda x: add_info(x["Run"], index_ref), axis = 1, result_type = "expand") - (MTD, database) = mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavages) + (MTD, database) = mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages) PRH = mztab_PRH(report, pg, unique, index_ref, database, fasta_pd) PEH = mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, database) PSH = mztab_PSH(unique, unimod_data, report, database) @@ -149,6 +152,7 @@ def query_expdesign_value(reference, f_table, s_table): :return: A tuple contains Fraction, BioReplicate and Condition :rtype: tuple """ + query_reference = f_table[f_table["run"] == reference] Fraction = query_reference["Fraction"].values[0] row = s_table[s_table["Sample"] == query_reference["Sample"].values[0]] @@ -168,6 +172,7 @@ def convert_modification(peptide, unimod_data): :return: Peptide contains the modification names :rtype: str """ + pattern = re.compile(r"\((.*?)\)") origianl_mods = re.findall(pattern, peptide) for mod in set(origianl_mods): @@ -183,13 +188,14 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod): :param unimod_database: An instance of class UnimodDatabase :type unimod_database: sdrf_pipelines.openms.unimod.UnimodDatabase - :param fix_mod: Fixed modifications from DIA parameter list + :param fix_mod: Fixed modifications from openms.tsv :type fix_mod: str - :param var_mod: Variable modifications from DIA parameter list + :param var_mod: Variable modifications from openms.tsv :type var_mod: str :return: A tuple contains fixed and variable modifications, and flags indicating whether they are null :rtype: tuple """ + pattern = re.compile("\((.*?)\)") var_ptm = [] fix_ptm = [] @@ -225,13 +231,13 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod): return fix_ptm, var_ptm, fix_flag, var_flag -def mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavages): +def mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages): """Construct MTD sub-table. :param index_ref: On the basis of f_table, two columns "MS_run" and "study_variable" are added for matching :type indx_ref: pandas.core.frame.DataFrame - :param dia_params: A list contains DIA parameters - :type dia_params: list + :param opms: Dataframe for openms.tsv + :type opms: pandas.core.frame.DataFrame :param unimod_data: An instance of class UnimodDatabase :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase :param fasta: Fasta file path @@ -243,14 +249,15 @@ def mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavage :return: MTD sub-table :rtype: pandas.core.frame.DataFrame """ - dia_params_list = dia_params.split(";") - FragmentMassTolerance = dia_params_list[0] - FragmentMassToleranceUnit = dia_params_list[1] - PrecursorMassTolerance = dia_params_list[2] - PrecursorMassToleranceUnit = dia_params_list[3] - Enzyme = dia_params_list[4] - FixedModifications = dia_params_list[5] - VariableModifications = dia_params_list[6] + + FragmentMassTolerance = opms["FragmentMassTolerance"].values[0] + FragmentMassToleranceUnit = opms["FragmentMassToleranceUnit"].values[0] + PrecursorMassTolerance = opms["PrecursorMassTolerance"].values[0] + PrecursorMassToleranceUnit = opms["PrecursorMassToleranceUnit"].values[0] + Enzyme = opms["Enzyme"].values[0] + FixedModifications = opms["FixedModifications"].values[0] + VariableModifications = opms["VariableModifications"].values[0] + out_mztab_MTD = pd.DataFrame() out_mztab_MTD.loc[1, "mzTab-version"] = "1.0.0" out_mztab_MTD.loc[1, "mzTab-mode"] = "Summary" @@ -343,6 +350,7 @@ def mztab_PRH(report, pg, unique, index_ref, database, fasta_pd): :return: PRH sub-table :rtype: pandas.core.frame.DataFrame """ + file = list(pg.columns[5:]) col = {} for i in file: @@ -414,6 +422,7 @@ def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, databa :return: PEH sub-table :rtype: pandas.core.frame.DataFrame """ + out_mztab_PEH = pd.DataFrame() out_mztab_PEH = pr.iloc[:, 0:10] out_mztab_PEH = out_mztab_PEH.drop(["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"], axis = 1) @@ -482,6 +491,7 @@ def mztab_PSH(unique, unimod_data, report, database): :return: PSH sub-table :rtype: pandas.core.frame.DataFrame """ + out_mztab_PSH = pd.DataFrame() ## Score at PSM level: Q.Value out_mztab_PSH = report[["Stripped.Sequence", "Protein.Ids", "Genes", "Q.Value", "RT", @@ -526,6 +536,7 @@ def add_info(target, index_ref): :return: A tuple contains ms_run and study_variable :rtype: tuple """ + match = index_ref[index_ref["run"] == target] ms_run = match["ms_run"].values[0] study_variable = match["study_variable"].values[0] @@ -547,6 +558,7 @@ def classify_protein(target, unique, t, f): :return: The signature of genes :rtype: str """ + if any(unique == target): return t else: @@ -565,6 +577,7 @@ def calculate_protein_coverage(report, target, fasta_pd): :return: Protein coverage :rtype: str """ + protein_coverage = "" len_current = len(max(report[report["Protein.Ids"] == target]["Stripped.Sequence"].values, key = len)) for i in target.split(";"): @@ -590,6 +603,7 @@ def match_in_report(report, target, max, flag, level): :return: A tuple contains multiple messages :rtype: tuple """ + if flag == 1 and level == 'pep': result = report[report['precursor.Index'] == target] PEH_params = [] @@ -628,6 +642,7 @@ def PRH_match_report(report, target): :return: A tuple contains multiple information to construct PRH sub-table :rtype: tuple """ + match = report[report["Protein.Ids"] == target] modSeq = match["Modified.Sequence"].values[0] if match["Modified.Sequence"].values.size > 0 else np.nan ## Score at protein level: Global.PG.Q.Value (without MBR) @@ -646,6 +661,7 @@ def PEH_match_report(report, target): :return: A tuple contains multiple information to construct PEH sub-table :rtype: tuple """ + match = report[report["precursor.Index"] == target] ## Score at peptide level: the minimum of the respective precursor q-values (minimum of Q.Value per group) search_score = match["Q.Value"].min() @@ -665,6 +681,7 @@ def find_modification(peptide): :return: Modification sites :rtype: str """ + pattern = re.compile(r"\((.*?)\)") original_mods = re.findall(pattern, peptide) peptide = re.sub('\(.*?\)','.',peptide) diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index 4b8406ed..225e114c 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -15,7 +15,7 @@ process DIANNCONVERT { path(report_pg) path(report_pr) path(report_unique_gene) - val(meta) + path(openms) path(fasta) val(charge) val(missed_cleavages) @@ -23,26 +23,24 @@ process DIANNCONVERT { output: path "*msstats_in.csv", emit: out_msstats path "*triqler_in.tsv", emit: out_triqler - path "*.mztab", emit: out_mztab + path "*out.mztab", emit: out_mztab path "versions.yml", emit: version script: def args = task.ext.args ?: '' - def dia_params = [meta.fragmentmasstolerance, meta.fragmentmasstoleranceunit, meta.precursormasstolerance, - meta.precursormasstoleranceunit, meta.enzyme, meta.fixedmodifications, meta.variablemodifications].join(';') """ diann_convert.py convert \\ - --diann_report "${report}" \\ - --exp_design "${exp_design}" \\ - --pg_matrix "${report_pg}" \\ - --pr_matrix "${report_pr}" \\ - --unique_matrix "${report_unique_gene}" \\ - --dia_params "${dia_params}" \\ - --fasta "${fasta}" \\ + --diann_report ${report} \\ + --exp_design ${exp_design} \\ + --pg_matrix ${report_pg} \\ + --pr_matrix ${report_pr} \\ + --unique_matrix ${report_unique_gene} \\ + --openms ${openms} \\ + --fasta ${fasta} \\ --charge ${charge} \\ --missed_cleavages ${missed_cleavages} \\ - --qvalue_threshold ${params.protein_level_fdr_cutoff} \\ + --qvalue_threshold $params.protein_level_fdr_cutoff \\ |& tee convert_report.log cat <<-END_VERSIONS > versions.yml diff --git a/subworkflows/local/create_input_channel.nf b/subworkflows/local/create_input_channel.nf index e0d624dd..8269ee66 100644 --- a/subworkflows/local/create_input_channel.nf +++ b/subworkflows/local/create_input_channel.nf @@ -54,7 +54,7 @@ workflow CREATE_INPUT_CHANNEL { ch_meta_config_iso // [meta, [spectra_files ]] ch_meta_config_lfq // [meta, [spectra_files ]] ch_meta_config_dia // [meta, [spectra files ]] - + ch_config ch_expdesign version = ch_versions diff --git a/workflows/dia.nf b/workflows/dia.nf index a290d4f9..00cabbab 100644 --- a/workflows/dia.nf +++ b/workflows/dia.nf @@ -34,6 +34,7 @@ workflow DIA { take: ch_file_preparation_results ch_expdesign + ch_config main: @@ -85,8 +86,7 @@ workflow DIA { // // MODULE: DIANNCONVERT // - DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, DIANNSUMMARY.out.unique_gene_matrix, - ch_result.meta.first(), params.database, params.max_precursor_charge, params.allowed_missed_cleavages) + DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, DIANNSUMMARY.out.unique_gene_matrix, ch_config, params.database, params.max_precursor_charge, params.allowed_missed_cleavages) ch_software_versions = ch_software_versions.mix(DIANNCONVERT.out.version.ifEmpty(null)) // diff --git a/workflows/quantms.nf b/workflows/quantms.nf index 0970cd88..529ba9b0 100644 --- a/workflows/quantms.nf +++ b/workflows/quantms.nf @@ -151,7 +151,7 @@ workflow QUANTMS { ch_msstats_in = ch_msstats_in.mix(LFQ.out.msstats_in) ch_versions = ch_versions.mix(LFQ.out.versions.ifEmpty(null)) - DIA(ch_fileprep_result.dia, CREATE_INPUT_CHANNEL.out.ch_expdesign) + DIA(ch_fileprep_result.dia, CREATE_INPUT_CHANNEL.out.ch_expdesign, CREATE_INPUT_CHANNEL.out.ch_config) ch_pipeline_results = ch_pipeline_results.mix(DIA.out.diann_report) ch_msstats_in = ch_msstats_in.mix(DIA.out.msstats_in) ch_versions = ch_versions.mix(DIA.out.versions.ifEmpty(null)) From 2be46e606a5f808d8bdfecef20821af4960a645e Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Mon, 4 Jul 2022 17:15:59 +0800 Subject: [PATCH 35/47] abandon "openms.tsv" --- bin/diann_convert.py | 53 ++++++++-------------- modules/local/diannconvert/main.nf | 22 +++++---- subworkflows/local/create_input_channel.nf | 1 - workflows/dia.nf | 4 +- workflows/quantms.nf | 2 +- 5 files changed, 33 insertions(+), 49 deletions(-) diff --git a/bin/diann_convert.py b/bin/diann_convert.py index 472a573f..2509f669 100755 --- a/bin/diann_convert.py +++ b/bin/diann_convert.py @@ -20,14 +20,14 @@ def cli(): @click.option("--pg_matrix", "-pg") @click.option("--pr_matrix", "-pr") @click.option("--unique_matrix", "-un") -@click.option("--openms", "-o") +@click.option("--dia_params", "-p") @click.option("--fasta", "-f") @click.option("--charge", "-c") @click.option("--missed_cleavages", "-m") @click.option("--qvalue_threshold", "-q", type=float) @click.pass_context -def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, openms, fasta, charge, missed_cleavages, qvalue_threshold): +def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, dia_params, fasta, charge, missed_cleavages, qvalue_threshold): """This function is designed to convert the DIA-NN output into three standard formats: MSstats, Triqler and mzTab. These documents are used for quality control and downstream analysis. @@ -41,8 +41,8 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, :type pr_matrix: str :param unique_matrix: Path to a DIA-NN matrix file containing unique genes :type unique_matrix: str - :param openms: Path to OpenMS, one of the workflow configuration files - :type openms: str + :param dia_params: A list contains DIA parameters + :type dia_params: list :param fasta: Path to the fasta file :type fasta: str :param charge: The charge assigned by DIA-NN(max_precursor_charge) @@ -52,12 +52,9 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, :param qvalue_threshold: Threshold for filtering q value :type qvalue_threshold: float """ - pg = pd.read_csv(pg_matrix, sep = "\t", header = 0, dtype = "str") pr = pd.read_csv(pr_matrix, sep = "\t", header = 0, dtype = "str") unique = pd.read_csv(unique_matrix, sep = "\t", header = 0, dtype = "str") - opms = pd.read_csv(openms, sep = "\t", header = 0, dtype = "str") - opms.fillna("null", inplace=True) report = pd.read_csv(diann_report, sep = "\t", header = 0, dtype = "str") report["Calculate.Precursor.Mz"] = report.apply(lambda x: molecular_weight(x["Stripped.Sequence"], 'protein', monoisotopic=True) / int(x["Precursor.Charge"]), axis=1) precursor_list = list(report["Precursor.Id"].unique()) @@ -125,7 +122,7 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, index_ref.loc[:, "study_variable"] = index_ref.loc[:, "study_variable"].astype("int") report[["ms_run", "study_variable"]] = report.apply(lambda x: add_info(x["Run"], index_ref), axis = 1, result_type = "expand") - (MTD, database) = mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages) + (MTD, database) = mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavages) PRH = mztab_PRH(report, pg, unique, index_ref, database, fasta_pd) PEH = mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, database) PSH = mztab_PSH(unique, unimod_data, report, database) @@ -152,7 +149,6 @@ def query_expdesign_value(reference, f_table, s_table): :return: A tuple contains Fraction, BioReplicate and Condition :rtype: tuple """ - query_reference = f_table[f_table["run"] == reference] Fraction = query_reference["Fraction"].values[0] row = s_table[s_table["Sample"] == query_reference["Sample"].values[0]] @@ -172,7 +168,6 @@ def convert_modification(peptide, unimod_data): :return: Peptide contains the modification names :rtype: str """ - pattern = re.compile(r"\((.*?)\)") origianl_mods = re.findall(pattern, peptide) for mod in set(origianl_mods): @@ -188,14 +183,13 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod): :param unimod_database: An instance of class UnimodDatabase :type unimod_database: sdrf_pipelines.openms.unimod.UnimodDatabase - :param fix_mod: Fixed modifications from openms.tsv + :param fix_mod: Fixed modifications from DIA parameter list :type fix_mod: str - :param var_mod: Variable modifications from openms.tsv + :param var_mod: Variable modifications from DIA parameter list :type var_mod: str :return: A tuple contains fixed and variable modifications, and flags indicating whether they are null :rtype: tuple """ - pattern = re.compile("\((.*?)\)") var_ptm = [] fix_ptm = [] @@ -231,13 +225,13 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod): return fix_ptm, var_ptm, fix_flag, var_flag -def mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages): +def mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavages): """Construct MTD sub-table. :param index_ref: On the basis of f_table, two columns "MS_run" and "study_variable" are added for matching :type indx_ref: pandas.core.frame.DataFrame - :param opms: Dataframe for openms.tsv - :type opms: pandas.core.frame.DataFrame + :param dia_params: A list contains DIA parameters + :type dia_params: list :param unimod_data: An instance of class UnimodDatabase :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase :param fasta: Fasta file path @@ -249,15 +243,14 @@ def mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages): :return: MTD sub-table :rtype: pandas.core.frame.DataFrame """ - - FragmentMassTolerance = opms["FragmentMassTolerance"].values[0] - FragmentMassToleranceUnit = opms["FragmentMassToleranceUnit"].values[0] - PrecursorMassTolerance = opms["PrecursorMassTolerance"].values[0] - PrecursorMassToleranceUnit = opms["PrecursorMassToleranceUnit"].values[0] - Enzyme = opms["Enzyme"].values[0] - FixedModifications = opms["FixedModifications"].values[0] - VariableModifications = opms["VariableModifications"].values[0] - + dia_params_list = dia_params.split(";") + FragmentMassTolerance = dia_params_list[0] + FragmentMassToleranceUnit = dia_params_list[1] + PrecursorMassTolerance = dia_params_list[2] + PrecursorMassToleranceUnit = dia_params_list[3] + Enzyme = dia_params_list[4] + FixedModifications = dia_params_list[5] + VariableModifications = dia_params_list[6] out_mztab_MTD = pd.DataFrame() out_mztab_MTD.loc[1, "mzTab-version"] = "1.0.0" out_mztab_MTD.loc[1, "mzTab-mode"] = "Summary" @@ -350,7 +343,6 @@ def mztab_PRH(report, pg, unique, index_ref, database, fasta_pd): :return: PRH sub-table :rtype: pandas.core.frame.DataFrame """ - file = list(pg.columns[5:]) col = {} for i in file: @@ -422,7 +414,6 @@ def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, databa :return: PEH sub-table :rtype: pandas.core.frame.DataFrame """ - out_mztab_PEH = pd.DataFrame() out_mztab_PEH = pr.iloc[:, 0:10] out_mztab_PEH = out_mztab_PEH.drop(["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"], axis = 1) @@ -491,7 +482,6 @@ def mztab_PSH(unique, unimod_data, report, database): :return: PSH sub-table :rtype: pandas.core.frame.DataFrame """ - out_mztab_PSH = pd.DataFrame() ## Score at PSM level: Q.Value out_mztab_PSH = report[["Stripped.Sequence", "Protein.Ids", "Genes", "Q.Value", "RT", @@ -536,7 +526,6 @@ def add_info(target, index_ref): :return: A tuple contains ms_run and study_variable :rtype: tuple """ - match = index_ref[index_ref["run"] == target] ms_run = match["ms_run"].values[0] study_variable = match["study_variable"].values[0] @@ -558,7 +547,6 @@ def classify_protein(target, unique, t, f): :return: The signature of genes :rtype: str """ - if any(unique == target): return t else: @@ -577,7 +565,6 @@ def calculate_protein_coverage(report, target, fasta_pd): :return: Protein coverage :rtype: str """ - protein_coverage = "" len_current = len(max(report[report["Protein.Ids"] == target]["Stripped.Sequence"].values, key = len)) for i in target.split(";"): @@ -603,7 +590,6 @@ def match_in_report(report, target, max, flag, level): :return: A tuple contains multiple messages :rtype: tuple """ - if flag == 1 and level == 'pep': result = report[report['precursor.Index'] == target] PEH_params = [] @@ -642,7 +628,6 @@ def PRH_match_report(report, target): :return: A tuple contains multiple information to construct PRH sub-table :rtype: tuple """ - match = report[report["Protein.Ids"] == target] modSeq = match["Modified.Sequence"].values[0] if match["Modified.Sequence"].values.size > 0 else np.nan ## Score at protein level: Global.PG.Q.Value (without MBR) @@ -661,7 +646,6 @@ def PEH_match_report(report, target): :return: A tuple contains multiple information to construct PEH sub-table :rtype: tuple """ - match = report[report["precursor.Index"] == target] ## Score at peptide level: the minimum of the respective precursor q-values (minimum of Q.Value per group) search_score = match["Q.Value"].min() @@ -681,7 +665,6 @@ def find_modification(peptide): :return: Modification sites :rtype: str """ - pattern = re.compile(r"\((.*?)\)") original_mods = re.findall(pattern, peptide) peptide = re.sub('\(.*?\)','.',peptide) diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index 225e114c..4b8406ed 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -15,7 +15,7 @@ process DIANNCONVERT { path(report_pg) path(report_pr) path(report_unique_gene) - path(openms) + val(meta) path(fasta) val(charge) val(missed_cleavages) @@ -23,24 +23,26 @@ process DIANNCONVERT { output: path "*msstats_in.csv", emit: out_msstats path "*triqler_in.tsv", emit: out_triqler - path "*out.mztab", emit: out_mztab + path "*.mztab", emit: out_mztab path "versions.yml", emit: version script: def args = task.ext.args ?: '' + def dia_params = [meta.fragmentmasstolerance, meta.fragmentmasstoleranceunit, meta.precursormasstolerance, + meta.precursormasstoleranceunit, meta.enzyme, meta.fixedmodifications, meta.variablemodifications].join(';') """ diann_convert.py convert \\ - --diann_report ${report} \\ - --exp_design ${exp_design} \\ - --pg_matrix ${report_pg} \\ - --pr_matrix ${report_pr} \\ - --unique_matrix ${report_unique_gene} \\ - --openms ${openms} \\ - --fasta ${fasta} \\ + --diann_report "${report}" \\ + --exp_design "${exp_design}" \\ + --pg_matrix "${report_pg}" \\ + --pr_matrix "${report_pr}" \\ + --unique_matrix "${report_unique_gene}" \\ + --dia_params "${dia_params}" \\ + --fasta "${fasta}" \\ --charge ${charge} \\ --missed_cleavages ${missed_cleavages} \\ - --qvalue_threshold $params.protein_level_fdr_cutoff \\ + --qvalue_threshold ${params.protein_level_fdr_cutoff} \\ |& tee convert_report.log cat <<-END_VERSIONS > versions.yml diff --git a/subworkflows/local/create_input_channel.nf b/subworkflows/local/create_input_channel.nf index 8269ee66..e875b12d 100644 --- a/subworkflows/local/create_input_channel.nf +++ b/subworkflows/local/create_input_channel.nf @@ -54,7 +54,6 @@ workflow CREATE_INPUT_CHANNEL { ch_meta_config_iso // [meta, [spectra_files ]] ch_meta_config_lfq // [meta, [spectra_files ]] ch_meta_config_dia // [meta, [spectra files ]] - ch_config ch_expdesign version = ch_versions diff --git a/workflows/dia.nf b/workflows/dia.nf index 00cabbab..a290d4f9 100644 --- a/workflows/dia.nf +++ b/workflows/dia.nf @@ -34,7 +34,6 @@ workflow DIA { take: ch_file_preparation_results ch_expdesign - ch_config main: @@ -86,7 +85,8 @@ workflow DIA { // // MODULE: DIANNCONVERT // - DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, DIANNSUMMARY.out.unique_gene_matrix, ch_config, params.database, params.max_precursor_charge, params.allowed_missed_cleavages) + DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, DIANNSUMMARY.out.unique_gene_matrix, + ch_result.meta.first(), params.database, params.max_precursor_charge, params.allowed_missed_cleavages) ch_software_versions = ch_software_versions.mix(DIANNCONVERT.out.version.ifEmpty(null)) // diff --git a/workflows/quantms.nf b/workflows/quantms.nf index 529ba9b0..0970cd88 100644 --- a/workflows/quantms.nf +++ b/workflows/quantms.nf @@ -151,7 +151,7 @@ workflow QUANTMS { ch_msstats_in = ch_msstats_in.mix(LFQ.out.msstats_in) ch_versions = ch_versions.mix(LFQ.out.versions.ifEmpty(null)) - DIA(ch_fileprep_result.dia, CREATE_INPUT_CHANNEL.out.ch_expdesign, CREATE_INPUT_CHANNEL.out.ch_config) + DIA(ch_fileprep_result.dia, CREATE_INPUT_CHANNEL.out.ch_expdesign) ch_pipeline_results = ch_pipeline_results.mix(DIA.out.diann_report) ch_msstats_in = ch_msstats_in.mix(DIA.out.msstats_in) ch_versions = ch_versions.mix(DIA.out.versions.ifEmpty(null)) From 6cefcfffcf0c8e0a259b604c9f5977897f0c057d Mon Sep 17 00:00:00 2001 From: Hong Wang <88552471+WangHong007@users.noreply.github.com> Date: Mon, 4 Jul 2022 18:55:49 +0800 Subject: [PATCH 36/47] Update main.nf --- modules/local/diannconvert/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index 4b8406ed..fdcc0700 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -28,8 +28,8 @@ process DIANNCONVERT { script: def args = task.ext.args ?: '' - def dia_params = [meta.fragmentmasstolerance, meta.fragmentmasstoleranceunit, meta.precursormasstolerance, - meta.precursormasstoleranceunit, meta.enzyme, meta.fixedmodifications, meta.variablemodifications].join(';') + def dia_params = [meta.fragmentmasstolerance,meta.fragmentmasstoleranceunit,meta.precursormasstolerance, + meta.precursormasstoleranceunit,meta.enzyme,meta.fixedmodifications,meta.variablemodifications].join(';') """ diann_convert.py convert \\ From b0101562e1de2d8fcb04925a7c783e7765be6199 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Tue, 12 Jul 2022 21:31:00 +0800 Subject: [PATCH 37/47] disable Bio --- bin/diann_convert.py | 56 +++++++------------------------------------- 1 file changed, 9 insertions(+), 47 deletions(-) diff --git a/bin/diann_convert.py b/bin/diann_convert.py index 2509f669..1340fcef 100755 --- a/bin/diann_convert.py +++ b/bin/diann_convert.py @@ -5,8 +5,6 @@ import os import re import numpy as np -from Bio import SeqIO -from Bio.SeqUtils import molecular_weight from sdrf_pipelines.openms.unimod import UnimodDatabase CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) @@ -56,7 +54,6 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, pr = pd.read_csv(pr_matrix, sep = "\t", header = 0, dtype = "str") unique = pd.read_csv(unique_matrix, sep = "\t", header = 0, dtype = "str") report = pd.read_csv(diann_report, sep = "\t", header = 0, dtype = "str") - report["Calculate.Precursor.Mz"] = report.apply(lambda x: molecular_weight(x["Stripped.Sequence"], 'protein', monoisotopic=True) / int(x["Precursor.Charge"]), axis=1) precursor_list = list(report["Precursor.Id"].unique()) report["precursor.Index"] = report.apply(lambda x: precursor_list.index(x["Precursor.Id"]), axis=1) @@ -107,14 +104,6 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, out_triqler.to_csv(os.path.splitext(os.path.basename(exp_design))[0] + '_triqler_in.tsv', sep='\t', index=False) # Convert to mzTab - fasta_pd = pd.DataFrame() - line = 0 - for seq_record in SeqIO.parse(fasta,"fasta"): - fasta_pd.loc[line, "id"] = seq_record.id - fasta_pd.loc[line, "seq"] = re.sub("\(|\)|\,", "", str(seq_record.seq)) - fasta_pd.loc[line, "len"] = len(seq_record) - line += 1 - index_ref = f_table index_ref.loc[:, "ms_run"] = index_ref.apply(lambda x: x["Fraction_Group"], axis=1) index_ref.loc[:, "study_variable"] = index_ref.apply(lambda x: x["Sample"], axis=1) @@ -123,7 +112,7 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, report[["ms_run", "study_variable"]] = report.apply(lambda x: add_info(x["Run"], index_ref), axis = 1, result_type = "expand") (MTD, database) = mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavages) - PRH = mztab_PRH(report, pg, unique, index_ref, database, fasta_pd) + PRH = mztab_PRH(report, pg, unique, index_ref, database) PEH = mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, database) PSH = mztab_PSH(unique, unimod_data, report, database) MTD.loc["", :] = "" @@ -325,7 +314,7 @@ def mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavage return out_mztab_MTD_T, database -def mztab_PRH(report, pg, unique, index_ref, database, fasta_pd): +def mztab_PRH(report, pg, unique, index_ref, database): """Construct PRH sub-table. :param report: Dataframe for Dia-NN main report @@ -338,8 +327,6 @@ def mztab_PRH(report, pg, unique, index_ref, database, fasta_pd): :type indx_ref: pandas.core.frame.DataFrame :param database: Path to fasta file :type database: str - :param fasta_pd: A dataframe contains protein IDs, sequences and lengths - :type fasta_pd: pandas.core.frame.DataFrame :return: PRH sub-table :rtype: pandas.core.frame.DataFrame """ @@ -359,13 +346,10 @@ def mztab_PRH(report, pg, unique, index_ref, database, fasta_pd): out_mztab_PRH.loc[:, "database"] = database null_col = ["taxid", "species", "database_version", "search_engine", "opt_global_Posterior_Probability_score", - "opt_global_nr_found_peptides", "opt_global_cv_PRIDE:0000303_decoy_hit"] + "opt_global_nr_found_peptides", "opt_global_cv_PRIDE:0000303_decoy_hit", "protein_coverage"] for i in null_col: out_mztab_PRH.loc[:, i] = "null" - out_mztab_PRH.loc[:, "protein_coverage"] = out_mztab_PRH.apply( - lambda x: calculate_protein_coverage(report, x["accession"], fasta_pd), axis=1, result_type="expand") - out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.loc[:, "accession"] out_mztab_PRH[["modifiedSequence", "best_search_engine_score[1]"]] = out_mztab_PRH.apply( @@ -433,7 +417,7 @@ def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, databa out_mztab_PEH.loc[:, "unique"] = out_mztab_PEH.apply(lambda x: classify_protein(x["Genes"], unique["Genes"], "1", "0"), axis=1, result_type="expand") - null_col = ["database_version", "search_engine", "retention_time_window"] + null_col = ["database_version", "search_engine", "retention_time_window", "mass_to_charge"] for i in null_col: out_mztab_PEH.loc[:, i] = "null" out_mztab_PEH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" @@ -457,7 +441,7 @@ def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, databa out_mztab_PEH[PEH_params] = out_mztab_PEH.apply( lambda x: match_in_report(report, x['pr_id'], max_study_variable, 1, 'pep'), axis=1, result_type='expand') - out_mztab_PEH[["best_search_engine_score[1]", "retention_time", "opt_global_q-value", "opt_global_SpecEValue_score", "mass_to_charge" + out_mztab_PEH[["best_search_engine_score[1]", "retention_time", "opt_global_q-value", "opt_global_SpecEValue_score" ]] = out_mztab_PEH.apply(lambda x: PEH_match_report(report, x['pr_id']), axis=1, result_type="expand") out_mztab_PEH[["opt_global_feature_id", "spectra_ref"]] = out_mztab_PEH.apply(lambda x:("null", "null"),axis = 1, result_type = "expand") @@ -485,8 +469,8 @@ def mztab_PSH(unique, unimod_data, report, database): out_mztab_PSH = pd.DataFrame() ## Score at PSM level: Q.Value out_mztab_PSH = report[["Stripped.Sequence", "Protein.Ids", "Genes", "Q.Value", "RT", - "Precursor.Charge", "Calculate.Precursor.Mz", "Modified.Sequence", "PEP", "Global.Q.Value", "Global.Q.Value"]] - out_mztab_PSH.columns = ["sequence", "accession", "Genes", "search_engine_score[1]", "retention_time", "charge", "calc_mass_to_charge", + "Precursor.Charge", "Modified.Sequence", "PEP", "Global.Q.Value", "Global.Q.Value"]] + out_mztab_PSH.columns = ["sequence", "accession", "Genes", "search_engine_score[1]", "retention_time", "charge", "opt_global_cv_MS:1000889_peptidoform_sequence", "opt_global_SpecEValue_score", "opt_global_q-value", "opt_global_q-value_score"] out_mztab_PSH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" @@ -499,7 +483,7 @@ def mztab_PSH(unique, unimod_data, report, database): out_mztab_PSH.loc[:, "database"] = database null_col = ["database_version", "spectra_ref", "search_engine", "exp_mass_to_charge", "pre", "post", - "start", "end", "opt_global_feature_id", "opt_global_map_index", "opt_global_spectrum_reference"] + "start", "end", "calc_mass_to_charge", "opt_global_feature_id", "opt_global_map_index", "opt_global_spectrum_reference"] for i in null_col: out_mztab_PSH.loc[:, i] = "null" @@ -553,27 +537,6 @@ def classify_protein(target, unique, t, f): return f -def calculate_protein_coverage(report, target, fasta_pd): - """Calculate protein coverage. - - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - :param target: The "accession" column in out_mztab_PRH - :type target: pandas.core.series.Series - :param fasta_pd: A dataframe contains protein IDs, sequences and lengths - :type fasta_pd: pandas.core.frame.DataFrame - :return: Protein coverage - :rtype: str - """ - protein_coverage = "" - len_current = len(max(report[report["Protein.Ids"] == target]["Stripped.Sequence"].values, key = len)) - for i in target.split(";"): - len_original = fasta_pd[fasta_pd["id"].str.contains(i)]["len"].values[0] - protein_coverage += format(len_current / len_original, ".3f") + ";" - - return protein_coverage.strip(";") - - def match_in_report(report, target, max, flag, level): """This function is used to match the columns "ms_run" and "study_variable" in the report to get the information. @@ -652,9 +615,8 @@ def PEH_match_report(report, target): time = match["RT"].mean() q_score = match["Global.Q.Value"].values[0] if match["Global.Q.Value"].values.size > 0 else np.nan spec_e = match["Lib.Q.Value"].values[0] if match["Lib.Q.Value"].values.size > 0 else np.nan - mz = match["Calculate.Precursor.Mz"].mean() - return search_score, time, q_score, spec_e, mz + return search_score, time, q_score, spec_e def find_modification(peptide): From 71d481f8182bca4ddd5040c3550703b83c9e23ff Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Jul 2022 09:44:12 +0100 Subject: [PATCH 38/47] increase the multiqc version --- modules/nf-core/modules/multiqc/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/nf-core/modules/multiqc/main.nf b/modules/nf-core/modules/multiqc/main.nf index 1264aac1..6c81dfb9 100644 --- a/modules/nf-core/modules/multiqc/main.nf +++ b/modules/nf-core/modules/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda (params.enable_conda ? 'bioconda::multiqc=1.12' : null) + conda (params.enable_conda ? 'bioconda::multiqc=1.13a' : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.13a--pyhdfd78af_1' : + 'quay.io/biocontainers/multiqc:1.13a--pyhdfd78af_1' }" input: path multiqc_files From e1941728bfc173d89251fbe07277103a50cfc70f Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 29 Jul 2022 09:53:18 +0100 Subject: [PATCH 39/47] back to version 1.12 --- modules/nf-core/modules/multiqc/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/nf-core/modules/multiqc/main.nf b/modules/nf-core/modules/multiqc/main.nf index 6c81dfb9..1264aac1 100644 --- a/modules/nf-core/modules/multiqc/main.nf +++ b/modules/nf-core/modules/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda (params.enable_conda ? 'bioconda::multiqc=1.13a' : null) + conda (params.enable_conda ? 'bioconda::multiqc=1.12' : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.13a--pyhdfd78af_1' : - 'quay.io/biocontainers/multiqc:1.13a--pyhdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }" input: path multiqc_files From 78992017264d79b1e87507863d8e55da1deba632 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Sat, 30 Jul 2022 17:03:55 +0800 Subject: [PATCH 40/47] Update diann_convert.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace Bio with pyopenms, disable unique genes matrix and some small fixs --- bin/diann_convert.py | 133 +++++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 49 deletions(-) diff --git a/bin/diann_convert.py b/bin/diann_convert.py index 1340fcef..1c22d833 100755 --- a/bin/diann_convert.py +++ b/bin/diann_convert.py @@ -1,10 +1,12 @@ #!/usr/bin/env python +from ast import AnnAssign import pandas as pd import click import os import re import numpy as np +from pyopenms import * from sdrf_pipelines.openms.unimod import UnimodDatabase CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) @@ -17,7 +19,6 @@ def cli(): @click.option("--exp_design", "-e") @click.option("--pg_matrix", "-pg") @click.option("--pr_matrix", "-pr") -@click.option("--unique_matrix", "-un") @click.option("--dia_params", "-p") @click.option("--fasta", "-f") @click.option("--charge", "-c") @@ -25,7 +26,7 @@ def cli(): @click.option("--qvalue_threshold", "-q", type=float) @click.pass_context -def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, dia_params, fasta, charge, missed_cleavages, qvalue_threshold): +def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, dia_params, fasta, charge, missed_cleavages, qvalue_threshold): """This function is designed to convert the DIA-NN output into three standard formats: MSstats, Triqler and mzTab. These documents are used for quality control and downstream analysis. @@ -37,8 +38,6 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, :type pg_matrix: str :param pr_matrix: Path to a DIA-NN matrix file containing precursors :type pr_matrix: str - :param unique_matrix: Path to a DIA-NN matrix file containing unique genes - :type unique_matrix: str :param dia_params: A list contains DIA parameters :type dia_params: list :param fasta: Path to the fasta file @@ -51,9 +50,11 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, :type qvalue_threshold: float """ pg = pd.read_csv(pg_matrix, sep = "\t", header = 0, dtype = "str") - pr = pd.read_csv(pr_matrix, sep = "\t", header = 0, dtype = "str") - unique = pd.read_csv(unique_matrix, sep = "\t", header = 0, dtype = "str") + pr = pd.read_csv(pr_matrix, sep = "\t", header = 0, dtype = "str") report = pd.read_csv(diann_report, sep = "\t", header = 0, dtype = "str") + report["Calculate.Precursor.Mz"] = report.apply( + lambda x: AASequence.fromString(x["Stripped.Sequence"]).getMZ(int(x["Precursor.Charge"])), axis=1) + precursor_list = list(report["Precursor.Id"].unique()) report["precursor.Index"] = report.apply(lambda x: precursor_list.index(x["Precursor.Id"]), axis=1) @@ -104,6 +105,17 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, out_triqler.to_csv(os.path.splitext(os.path.basename(exp_design))[0] + '_triqler_in.tsv', sep='\t', index=False) # Convert to mzTab + fasta_pd = pd.DataFrame() + entries = [] + f = FASTAFile() + f.load(fasta, entries) + line = 0 + for e in entries: + fasta_pd.loc[line, "id"] = e.identifier + fasta_pd.loc[line, "seq"] = e.sequence + fasta_pd.loc[line, "len"] = len(e.sequence) + line += 1 + index_ref = f_table index_ref.loc[:, "ms_run"] = index_ref.apply(lambda x: x["Fraction_Group"], axis=1) index_ref.loc[:, "study_variable"] = index_ref.apply(lambda x: x["Sample"], axis=1) @@ -112,9 +124,9 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, report[["ms_run", "study_variable"]] = report.apply(lambda x: add_info(x["Run"], index_ref), axis = 1, result_type = "expand") (MTD, database) = mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavages) - PRH = mztab_PRH(report, pg, unique, index_ref, database) - PEH = mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, database) - PSH = mztab_PSH(unique, unimod_data, report, database) + PRH = mztab_PRH(report, pg, index_ref, database, fasta_pd) + PEH = mztab_PEH(report, pr, unimod_data, precursor_list, index_ref, database) + PSH = mztab_PSH(unimod_data, report, database) MTD.loc["", :] = "" PRH.loc[len(PRH) + 1, :] = "" PEH.loc[len(PEH) + 1, :] = "" @@ -314,19 +326,19 @@ def mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavage return out_mztab_MTD_T, database -def mztab_PRH(report, pg, unique, index_ref, database): +def mztab_PRH(report, pg, index_ref, database, fasta_pd): """Construct PRH sub-table. :param report: Dataframe for Dia-NN main report :type report: pandas.core.frame.DataFrame :param pg: Dataframe for Dia-NN protein groups matrix :type pg: pandas.core.frame.DataFrame - :param unique: Dataframe for Dia-NN unique genes matrix - :type unique: pandas.core.frame.DataFrame :param index_ref: On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching :type indx_ref: pandas.core.frame.DataFrame :param database: Path to fasta file :type database: str + :param fasta_pd: A dataframe contains protein IDs, sequences and lengths + :type fasta_pd: pandas.core.frame.DataFrame :return: PRH sub-table :rtype: pandas.core.frame.DataFrame """ @@ -336,9 +348,11 @@ def mztab_PRH(report, pg, unique, index_ref, database): col[i] = "protein_abundance_assay[" + str(index_ref[index_ref["run"] == i.split("/")[-1].split(".")[-2]]["ms_run"].values[0]) + "]" pg = pg.rename(columns = col) + pg.loc[:, "opt_global_result_type"] = pg.apply(lambda x: classify_result_type(x), axis=1, result_type="expand") + out_mztab_PRH = pd.DataFrame() - out_mztab_PRH = pg.drop(["Protein.Group", "Protein.Names"], axis = 1) - out_mztab_PRH = out_mztab_PRH.rename(columns = {"Protein.Ids":"accession", "First.Protein.Description":"description"}) + out_mztab_PRH = pg.drop(["Protein.Names"], axis = 1) + out_mztab_PRH = out_mztab_PRH.rename(columns = {"Protein.Group":"accession", "First.Protein.Description":"description"}) out_mztab_PRH.loc[:, "PRH"] = "PRT" index = out_mztab_PRH.loc[:, "PRH"] out_mztab_PRH.drop(labels = ["PRH"], axis = 1, inplace = True) @@ -346,18 +360,27 @@ def mztab_PRH(report, pg, unique, index_ref, database): out_mztab_PRH.loc[:, "database"] = database null_col = ["taxid", "species", "database_version", "search_engine", "opt_global_Posterior_Probability_score", - "opt_global_nr_found_peptides", "opt_global_cv_PRIDE:0000303_decoy_hit", "protein_coverage"] + "opt_global_nr_found_peptides", "opt_global_cv_PRIDE:0000303_decoy_hit"] for i in null_col: out_mztab_PRH.loc[:, i] = "null" - + out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.loc[:, "accession"] + prh_series = out_mztab_PRH["accession"].str.split(";", expand = True).stack().reset_index(level = 1, drop = True) + prh_series.name = "accession" + out_mztab_PRH = out_mztab_PRH.drop("accession", axis = 1).join(prh_series).reset_index().drop(columns = "index") + out_mztab_PRH.loc[:, "protein_coverage"] = out_mztab_PRH.apply( + lambda x: calculate_protein_coverage(report, x["accession"], x["Protein.Ids"], fasta_pd), axis=1, result_type="expand") + + out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.apply( + lambda x: "null" if x["opt_global_result_type"] == "protein_details" else x["ambiguity_members"], axis=1) + out_mztab_PRH[["modifiedSequence", "best_search_engine_score[1]"]] = out_mztab_PRH.apply( lambda x: PRH_match_report(report, x["accession"]), axis=1, result_type="expand") - + out_mztab_PRH.loc[:, "modifications"] = out_mztab_PRH.apply( lambda x: find_modification(x["modifiedSequence"]), axis=1, result_type="expand") - + ## quantity at protein level: PG.MaxLFQ max_study_variable = max(index_ref["study_variable"]) PRH_params = [] @@ -368,25 +391,20 @@ def mztab_PRH(report, pg, unique, index_ref, database): out_mztab_PRH[PRH_params] = out_mztab_PRH.apply( lambda x: match_in_report(report, x['accession'], max_study_variable, 1, 'protein'), axis=1, result_type='expand') - out_mztab_PRH.loc[:, "opt_global_result_type"] = out_mztab_PRH.apply( - lambda x: classify_protein(x["Genes"], unique["Genes"], "single_protein", "indistinguishable_protein_group"), axis=1, result_type="expand") - - out_mztab_PRH = out_mztab_PRH.drop(["Genes", "modifiedSequence"], axis=1) + out_mztab_PRH = out_mztab_PRH.drop(["Genes", "modifiedSequence", "Protein.Ids"], axis=1) out_mztab_PRH.fillna("null", inplace=True) # out_mztab_PRH.to_csv("./out_protein.mztab", sep=",", index=False) return out_mztab_PRH -def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, database): +def mztab_PEH(report, pr, unimod_data, precursor_list, index_ref, database): """Construct PEH sub-table. :param report: Dataframe for Dia-NN main report :type report: pandas.core.frame.DataFrame :param pr: Dataframe for Dia-NN precursors matrix :type pr: pandas.core.frame.DataFrame - :param unique: Dataframe for Dia-NN unique genes matrix - :type unique: pandas.core.frame.DataFrame :param unimod_data: An instance of class UnimodDatabase :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase :param precursor_list: A list contains all precursor IDs @@ -415,7 +433,7 @@ def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, databa out_mztab_PEH.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = out_mztab_PEH.apply( lambda x: convert_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"], unimod_data), axis=1) - out_mztab_PEH.loc[:, "unique"] = out_mztab_PEH.apply(lambda x: classify_protein(x["Genes"], unique["Genes"], "1", "0"), axis=1, result_type="expand") + out_mztab_PEH.loc[:, "unique"] = out_mztab_PEH.apply(lambda x: "0" if ";" in x["accession"] else "1", axis=1, result_type="expand") null_col = ["database_version", "search_engine", "retention_time_window", "mass_to_charge"] for i in null_col: @@ -441,7 +459,7 @@ def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, databa out_mztab_PEH[PEH_params] = out_mztab_PEH.apply( lambda x: match_in_report(report, x['pr_id'], max_study_variable, 1, 'pep'), axis=1, result_type='expand') - out_mztab_PEH[["best_search_engine_score[1]", "retention_time", "opt_global_q-value", "opt_global_SpecEValue_score" + out_mztab_PEH[["best_search_engine_score[1]", "retention_time", "opt_global_q-value", "opt_global_SpecEValue_score", "mass_to_charge" ]] = out_mztab_PEH.apply(lambda x: PEH_match_report(report, x['pr_id']), axis=1, result_type="expand") out_mztab_PEH[["opt_global_feature_id", "spectra_ref"]] = out_mztab_PEH.apply(lambda x:("null", "null"),axis = 1, result_type = "expand") @@ -452,11 +470,9 @@ def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, databa return out_mztab_PEH -def mztab_PSH(unique, unimod_data, report, database): +def mztab_PSH(unimod_data, report, database): """Construct PSH sub-table. - :param unique: Dataframe for Dia-NN unique genes matrix - :type unique: pandas.core.frame.DataFrame :param unimod_data: An instance of class UnimodDatabase :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase :param report: Dataframe for Dia-NN main report @@ -469,8 +485,8 @@ def mztab_PSH(unique, unimod_data, report, database): out_mztab_PSH = pd.DataFrame() ## Score at PSM level: Q.Value out_mztab_PSH = report[["Stripped.Sequence", "Protein.Ids", "Genes", "Q.Value", "RT", - "Precursor.Charge", "Modified.Sequence", "PEP", "Global.Q.Value", "Global.Q.Value"]] - out_mztab_PSH.columns = ["sequence", "accession", "Genes", "search_engine_score[1]", "retention_time", "charge", + "Precursor.Charge", "Calculate.Precursor.Mz", "Modified.Sequence", "PEP", "Global.Q.Value", "Global.Q.Value"]] + out_mztab_PSH.columns = ["sequence", "accession", "Genes", "search_engine_score[1]", "retention_time", "charge", "calc_mass_to_charge", "opt_global_cv_MS:1000889_peptidoform_sequence", "opt_global_SpecEValue_score", "opt_global_q-value", "opt_global_q-value_score"] out_mztab_PSH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" @@ -479,11 +495,11 @@ def mztab_PSH(unique, unimod_data, report, database): out_mztab_PSH.drop(labels = ["PSH"], axis = 1, inplace = True) out_mztab_PSH.insert(0, "PSH", index) out_mztab_PSH.loc[:, "PSM_ID"] = out_mztab_PSH.index - out_mztab_PSH.loc[:, "unique"] = out_mztab_PSH.apply(lambda x: classify_protein(x["Genes"], unique["Genes"], "1", "0"), axis=1, result_type="expand") + out_mztab_PSH.loc[:, "unique"] = out_mztab_PSH.apply(lambda x: "0" if ";" in x["accession"] else "1", axis=1, result_type="expand") out_mztab_PSH.loc[:, "database"] = database null_col = ["database_version", "spectra_ref", "search_engine", "exp_mass_to_charge", "pre", "post", - "start", "end", "calc_mass_to_charge", "opt_global_feature_id", "opt_global_map_index", "opt_global_spectrum_reference"] + "start", "end", "opt_global_feature_id", "opt_global_map_index", "opt_global_spectrum_reference"] for i in null_col: out_mztab_PSH.loc[:, i] = "null" @@ -517,24 +533,41 @@ def add_info(target, index_ref): return ms_run, study_variable -def classify_protein(target, unique, t, f): - """Proteins are classified according to whether the corresponding gene is unique or not. +def classify_result_type(target): + """Classify proteins - :param target: The "Genes" column in out_mztab_PSH - :type target: pandas.core.series.Series - :param unique: The "Genes" column in unique(Dataframe for Dia-NN unique genes matrix) - :type unique: pandas.core.series.Series - :param t: The signature of the unique gene - :type t: str - :param f: The signature of the NOT unique gene - :type f: str - :return: The signature of genes + :param target: The target dataframe contains "Protein.Group" and "Protein.Ids" + :type target: pandas.core.frame.DataFrame + :return: A string implys protein type :rtype: str """ - if any(unique == target): - return t + if ";" in target["Protein.Ids"]: + if ";" in target["Protein.Group"]: + return "protein_details" + else: + return "indistinguishable_protein_group" else: - return f + return "single_protein" + + +def calculate_protein_coverage(report, target, reference, fasta_pd): + """Calculate protein coverage. + :param report: Dataframe for Dia-NN main report + :type report: pandas.core.frame.DataFrame + :param target: The "accession" column in out_mztab_PRH + :type target: pandas.core.series.Series + :param target: The "Protein.Ids" column in report + :type target: pandas.core.series.Series + :param fasta_pd: A dataframe contains protein IDs, sequences and lengths + :type fasta_pd: pandas.core.frame.DataFrame + :return: Protein coverage + :rtype: str + """ + len_current = len(max(report[report["Protein.Ids"] == reference]["Stripped.Sequence"].values, key = len)) + len_original = fasta_pd[fasta_pd["id"].str.contains(target)]["len"].values[0] + protein_coverage = format(len_current / len_original, ".3f") + + return protein_coverage def match_in_report(report, target, max, flag, level): @@ -615,8 +648,9 @@ def PEH_match_report(report, target): time = match["RT"].mean() q_score = match["Global.Q.Value"].values[0] if match["Global.Q.Value"].values.size > 0 else np.nan spec_e = match["Lib.Q.Value"].values[0] if match["Lib.Q.Value"].values.size > 0 else np.nan + mz = match["Calculate.Precursor.Mz"].mean() - return search_score, time, q_score, spec_e + return search_score, time, q_score, spec_e, mz def find_modification(peptide): @@ -627,6 +661,7 @@ def find_modification(peptide): :return: Modification sites :rtype: str """ + peptide = str(peptide) pattern = re.compile(r"\((.*?)\)") original_mods = re.findall(pattern, peptide) peptide = re.sub('\(.*?\)','.',peptide) From 19223d36df7d17a1b85a640dd40a4c9bbcc504c7 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 2 Aug 2022 10:59:36 +0100 Subject: [PATCH 41/47] added decoys to the idfilter in TMT --- conf/modules.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index dab69338..8293818a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -176,12 +176,13 @@ process { // IDFILTER on PROTEIN LEVEL level = params.protein_quant == 'strictly_unique_peptides' ? 'prot' : 'protgroup' + def decoys_present = params.quantify_decoys ? '' : '-remove_decoys' withName: '.*:TMT:PROTEININFERENCE:IDFILTER' { ext.args = [ "-score:${level} \"$params.protein_level_fdr_cutoff\"", "-score:pep \"$params.psm_level_fdr_cutoff\"", "-delete_unreferenced_peptide_hits", - "-remove_decoys" + "${decoys_present}" ].join(' ').trim() ext.suffix = '.consensusXML' publishDir = [ From 0a9fd027a97ee41012f5718a1c881eac4a32a4e1 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 2 Aug 2022 11:03:34 +0100 Subject: [PATCH 42/47] added decoys to the idfilter in TMT --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 8293818a..98572169 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -176,7 +176,7 @@ process { // IDFILTER on PROTEIN LEVEL level = params.protein_quant == 'strictly_unique_peptides' ? 'prot' : 'protgroup' - def decoys_present = params.quantify_decoys ? '' : '-remove_decoys' + decoys_present = params.quantify_decoys ? '' : '-remove_decoys' withName: '.*:TMT:PROTEININFERENCE:IDFILTER' { ext.args = [ "-score:${level} \"$params.protein_level_fdr_cutoff\"", From 98ec03fc055076c94212c2050921d6f94dc99f4a Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Tue, 2 Aug 2022 19:02:55 +0800 Subject: [PATCH 43/47] Use pyopenms container Correct protein coverage and unique --- bin/diann_convert.py | 189 +++++++++++++++------------- modules/local/diannconvert/main.nf | 18 ++- modules/local/diannconvert/meta.yml | 23 +++- workflows/dia.nf | 4 +- 4 files changed, 134 insertions(+), 100 deletions(-) diff --git a/bin/diann_convert.py b/bin/diann_convert.py index 1c22d833..fcb88717 100755 --- a/bin/diann_convert.py +++ b/bin/diann_convert.py @@ -1,13 +1,11 @@ #!/usr/bin/env python -from ast import AnnAssign import pandas as pd import click import os import re import numpy as np from pyopenms import * -from sdrf_pipelines.openms.unimod import UnimodDatabase CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) @click.group(context_settings=CONTEXT_SETTINGS) @@ -65,7 +63,6 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, dia_params, fas # filter based on qvalue parameter for downstream analysiss report = report[report["Q.Value"] < qvalue_threshold] - unimod_data = UnimodDatabase() with open(exp_design, 'r') as f: data = f.readlines() empty_row = data.index('\n') @@ -82,7 +79,7 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, dia_params, fas out_msstats = pd.DataFrame() out_msstats = report[['Protein.Names', 'Modified.Sequence', 'Precursor.Charge', 'Precursor.Quantity', 'File.Name','Run']] out_msstats.columns = ['ProteinName', 'PeptideSequence', 'PrecursorCharge', 'Intensity', 'Reference', 'Run'] - out_msstats.loc[:,"PeptideSequence"] = out_msstats.apply(lambda x: convert_modification(x["PeptideSequence"], unimod_data), axis=1) + out_msstats.loc[:,"PeptideSequence"] = out_msstats.apply(lambda x: AASequence.fromString(x["PeptideSequence"]).toString(), axis=1) out_msstats.loc[:,"FragmentIon"] = 'NA' out_msstats.loc[:,"ProductCharge"] = '0' out_msstats.loc[:,"IsotopeLabelType"] = "L" @@ -105,15 +102,15 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, dia_params, fas out_triqler.to_csv(os.path.splitext(os.path.basename(exp_design))[0] + '_triqler_in.tsv', sep='\t', index=False) # Convert to mzTab - fasta_pd = pd.DataFrame() + fasta_df = pd.DataFrame() entries = [] f = FASTAFile() f.load(fasta, entries) line = 0 for e in entries: - fasta_pd.loc[line, "id"] = e.identifier - fasta_pd.loc[line, "seq"] = e.sequence - fasta_pd.loc[line, "len"] = len(e.sequence) + fasta_df.loc[line, "id"] = e.identifier + fasta_df.loc[line, "seq"] = e.sequence + fasta_df.loc[line, "len"] = len(e.sequence) line += 1 index_ref = f_table @@ -123,10 +120,10 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, dia_params, fas index_ref.loc[:, "study_variable"] = index_ref.loc[:, "study_variable"].astype("int") report[["ms_run", "study_variable"]] = report.apply(lambda x: add_info(x["Run"], index_ref), axis = 1, result_type = "expand") - (MTD, database) = mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavages) - PRH = mztab_PRH(report, pg, index_ref, database, fasta_pd) - PEH = mztab_PEH(report, pr, unimod_data, precursor_list, index_ref, database) - PSH = mztab_PSH(unimod_data, report, database) + (MTD, database) = mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages) + PRH = mztab_PRH(report, pg, index_ref, database, fasta_df) + PEH = mztab_PEH(report, pr, precursor_list, index_ref, database, fasta_df) + PSH = mztab_PSH(report, database, fasta_df) MTD.loc["", :] = "" PRH.loc[len(PRH) + 1, :] = "" PEH.loc[len(PEH) + 1, :] = "" @@ -141,8 +138,8 @@ def query_expdesign_value(reference, f_table, s_table): """By matching the "Run" column in f_table or the "Sample" column in s_table, this function returns a tuple containing Fraction, BioReplicate and Condition. - :param reference: The "Run" column in out_msstats - :type reference: pandas.core.series.Series + :param reference: The value of "Run" column in out_msstats + :type reference: str :param f_table: A table contains experiment settings(search engine settings etc.) :type f_table: pandas.core.frame.DataFrame :param s_table: A table contains experimental design @@ -159,31 +156,9 @@ def query_expdesign_value(reference, f_table, s_table): return Fraction, BioReplicate, Condition -def convert_modification(peptide, unimod_data): - """Convert the accession in the peptide to name("Unimod:35" to "Oxidation" etc.). - - :param peptide: Peptide contains the modification accessions - :type peptide: str - :param unimod_data: An instance of class UnimodDatabase - :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase - :return: Peptide contains the modification names - :rtype: str - """ - pattern = re.compile(r"\((.*?)\)") - origianl_mods = re.findall(pattern, peptide) - for mod in set(origianl_mods): - name = unimod_data.get_by_accession(mod.upper()).get_name() - peptide = peptide.replace(mod, name) - if peptide.startswith("("): - peptide = peptide + "." - return peptide - - -def MTD_mod_info(unimod_database, fix_mod, var_mod): +def MTD_mod_info(fix_mod, var_mod): """Convert fixed and variable modifications to the format required by the MTD sub-table. - :param unimod_database: An instance of class UnimodDatabase - :type unimod_database: sdrf_pipelines.openms.unimod.UnimodDatabase :param fix_mod: Fixed modifications from DIA parameter list :type fix_mod: str :param var_mod: Variable modifications from DIA parameter list @@ -191,19 +166,17 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod): :return: A tuple contains fixed and variable modifications, and flags indicating whether they are null :rtype: tuple """ - pattern = re.compile("\((.*?)\)") var_ptm = [] fix_ptm = [] + mods_db = ModificationsDB() if fix_mod != "null": fix_flag = 1 for mod in fix_mod.split(","): - for m in unimod_database.modifications: - if m.get_name() == mod.split(" ")[0]: - mod_name = m.get_name() - mod_accession = m.get_accession() - break - site = re.findall(pattern, " ".join(mod.split(" ")[1:]))[0] + mod_obj = mods_db.getModification(mod) + mod_name = mod_obj.getId() + mod_accession = mod_obj.getUniModAccession() + site = mod_obj.getOrigin() fix_ptm.append(("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site)) else: fix_flag = 0 @@ -212,12 +185,10 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod): if var_mod != "null": var_flag = 1 for mod in var_mod.split(","): - for m in unimod_database.modifications: - if m.get_name() == mod.split(" ")[0]: - mod_name = m.get_name() - mod_accession = m.get_accession() - break - site = re.findall(pattern, " ".join(mod.split(" ")[1:]))[0] + mod_obj = mods_db.getModification(mod) + mod_name = mod_obj.getId() + mod_accession = mod_obj.getUniModAccession() + site = mod_obj.getOrigin() var_ptm.append(("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site)) else: var_flag = 0 @@ -226,15 +197,13 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod): return fix_ptm, var_ptm, fix_flag, var_flag -def mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavages): +def mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages): """Construct MTD sub-table. :param index_ref: On the basis of f_table, two columns "MS_run" and "study_variable" are added for matching :type indx_ref: pandas.core.frame.DataFrame :param dia_params: A list contains DIA parameters :type dia_params: list - :param unimod_data: An instance of class UnimodDatabase - :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase :param fasta: Fasta file path :type fasta: str :param charge: Charges set by Dia-NN @@ -275,7 +244,7 @@ def mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavage out_mztab_MTD.loc[1, "software[1]-setting[11]"] = "fixed_modifications:" + FixedModifications out_mztab_MTD.loc[1, "software[1]-setting[12]"] = "variable_modifications:" + VariableModifications - (fixed_mods, variable_mods, fix_flag, var_flag) = MTD_mod_info(unimod_data, FixedModifications, VariableModifications) + (fixed_mods, variable_mods, fix_flag, var_flag) = MTD_mod_info(FixedModifications, VariableModifications) if fix_flag == 1: for i in range(1, len(fixed_mods) + 1): out_mztab_MTD.loc[1, "fixed_mod[" + str(i) + "]"] = fixed_mods[i - 1][0] @@ -326,7 +295,7 @@ def mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavage return out_mztab_MTD_T, database -def mztab_PRH(report, pg, index_ref, database, fasta_pd): +def mztab_PRH(report, pg, index_ref, database, fasta_df): """Construct PRH sub-table. :param report: Dataframe for Dia-NN main report @@ -337,8 +306,8 @@ def mztab_PRH(report, pg, index_ref, database, fasta_pd): :type indx_ref: pandas.core.frame.DataFrame :param database: Path to fasta file :type database: str - :param fasta_pd: A dataframe contains protein IDs, sequences and lengths - :type fasta_pd: pandas.core.frame.DataFrame + :param fasta_df: A dataframe contains protein IDs, sequences and lengths + :type fasta_df: pandas.core.frame.DataFrame :return: PRH sub-table :rtype: pandas.core.frame.DataFrame """ @@ -370,7 +339,7 @@ def mztab_PRH(report, pg, index_ref, database, fasta_pd): prh_series.name = "accession" out_mztab_PRH = out_mztab_PRH.drop("accession", axis = 1).join(prh_series).reset_index().drop(columns = "index") out_mztab_PRH.loc[:, "protein_coverage"] = out_mztab_PRH.apply( - lambda x: calculate_protein_coverage(report, x["accession"], x["Protein.Ids"], fasta_pd), axis=1, result_type="expand") + lambda x: calculate_protein_coverage(report, x["accession"], x["Protein.Ids"], fasta_df), axis=1, result_type="expand") out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.apply( lambda x: "null" if x["opt_global_result_type"] == "protein_details" else x["ambiguity_members"], axis=1) @@ -398,15 +367,13 @@ def mztab_PRH(report, pg, index_ref, database, fasta_pd): return out_mztab_PRH -def mztab_PEH(report, pr, unimod_data, precursor_list, index_ref, database): +def mztab_PEH(report, pr, precursor_list, index_ref, database, fasta_df): """Construct PEH sub-table. :param report: Dataframe for Dia-NN main report :type report: pandas.core.frame.DataFrame :param pr: Dataframe for Dia-NN precursors matrix :type pr: pandas.core.frame.DataFrame - :param unimod_data: An instance of class UnimodDatabase - :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase :param precursor_list: A list contains all precursor IDs :type precursor_list: list :param index_ref: On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching @@ -431,9 +398,9 @@ def mztab_PEH(report, pr, unimod_data, precursor_list, index_ref, database): lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), axis=1, result_type="expand") out_mztab_PEH.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = out_mztab_PEH.apply( - lambda x: convert_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"], unimod_data), axis=1) + lambda x: AASequence.fromString(x["opt_global_cv_MS:1000889_peptidoform_sequence"]).toString(), axis=1) - out_mztab_PEH.loc[:, "unique"] = out_mztab_PEH.apply(lambda x: "0" if ";" in x["accession"] else "1", axis=1, result_type="expand") + out_mztab_PEH.loc[:, "unique"] = out_mztab_PEH.apply(lambda x: Unique(x["accession"], x["sequence"], fasta_df), axis=1, result_type="expand") null_col = ["database_version", "search_engine", "retention_time_window", "mass_to_charge"] for i in null_col: @@ -470,11 +437,9 @@ def mztab_PEH(report, pr, unimod_data, precursor_list, index_ref, database): return out_mztab_PEH -def mztab_PSH(unimod_data, report, database): +def mztab_PSH(report, database, fasta_df): """Construct PSH sub-table. - :param unimod_data: An instance of class UnimodDatabase - :type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase :param report: Dataframe for Dia-NN main report :type report: pandas.core.frame.DataFrame :param database: Path to fasta file @@ -495,7 +460,7 @@ def mztab_PSH(unimod_data, report, database): out_mztab_PSH.drop(labels = ["PSH"], axis = 1, inplace = True) out_mztab_PSH.insert(0, "PSH", index) out_mztab_PSH.loc[:, "PSM_ID"] = out_mztab_PSH.index - out_mztab_PSH.loc[:, "unique"] = out_mztab_PSH.apply(lambda x: "0" if ";" in x["accession"] else "1", axis=1, result_type="expand") + out_mztab_PSH.loc[:, "unique"] = out_mztab_PSH.apply(lambda x: Unique(x["accession"], x["sequence"], fasta_df), axis=1, result_type="expand") out_mztab_PSH.loc[:, "database"] = database null_col = ["database_version", "spectra_ref", "search_engine", "exp_mass_to_charge", "pre", "post", @@ -507,7 +472,7 @@ def mztab_PSH(unimod_data, report, database): lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), axis=1, result_type="expand") out_mztab_PSH.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = out_mztab_PSH.apply( - lambda x: convert_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"], unimod_data), axis=1, result_type="expand") + lambda x: AASequence.fromString(x["opt_global_cv_MS:1000889_peptidoform_sequence"]).toString(), axis=1, result_type="expand") out_mztab_PSH = out_mztab_PSH.drop(["Genes"], axis = 1) out_mztab_PSH.fillna("null", inplace = True) @@ -519,8 +484,8 @@ def mztab_PSH(unimod_data, report, database): def add_info(target, index_ref): """On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching. - :param target: The "Run" column in f_table - :type target: pandas.core.series.Series + :param target: The value of "Run" column in f_table + :type target: str :param index_ref: A dataframe on the basis of f_table :type indx_ref: pandas.core.frame.DataFrame :return: A tuple contains ms_run and study_variable @@ -550,22 +515,74 @@ def classify_result_type(target): return "single_protein" -def calculate_protein_coverage(report, target, reference, fasta_pd): +def Unique(accession, pep_seq, fasta_df): + '''Find the location of the peptide and determine if the peptide is unique + + :param target: The value of "accession" column in out_mztab_PEH or out_mztab_PSH + :type target: str + :param pep_seq: Peptide sequence + :type sep_seq: str + :param fasta_df: A dataframe contains protein IDs, sequences and lengths + :type fasta_df: pandas.core.frame.DataFrame + :return: Unique flag(1 or 0) + :rtype: str + ''' + unique = 1 + for i in accession.split(";"): + pro_seq = fasta_df[fasta_df["id"].str.contains(i)]["seq"].values[0] + result = re.finditer(pep_seq, pro_seq) + section_list = [] + if result: + for i in result: + section_list.append([i.span()[0],i.span()[1]-1]) + + if len(section_list) != 1: + unique = 0 + + return str(unique) + + +def calculate_protein_coverage(report, target, reference, fasta_df): """Calculate protein coverage. :param report: Dataframe for Dia-NN main report :type report: pandas.core.frame.DataFrame - :param target: The "accession" column in out_mztab_PRH - :type target: pandas.core.series.Series - :param target: The "Protein.Ids" column in report - :type target: pandas.core.series.Series - :param fasta_pd: A dataframe contains protein IDs, sequences and lengths - :type fasta_pd: pandas.core.frame.DataFrame + :param target: The value of "accession" column in out_mztab_PRH + :type target: str + :param fasta_df: A dataframe contains protein IDs, sequences and lengths + :type fasta_df: pandas.core.frame.DataFrame :return: Protein coverage :rtype: str """ - len_current = len(max(report[report["Protein.Ids"] == reference]["Stripped.Sequence"].values, key = len)) - len_original = fasta_pd[fasta_pd["id"].str.contains(target)]["len"].values[0] - protein_coverage = format(len_current / len_original, ".3f") + peptide_list = report[report["Protein.Ids"] == reference]["Stripped.Sequence"].drop_duplicates().values + unique_peptides = [j for i,j in enumerate(peptide_list) if all(j not in k for k in peptide_list[i+1:])] + resultlist = [] + ref = fasta_df[fasta_df["id"].str.contains(target)]["seq"].values[0] + + def findstr(basestr, s, resultlist): + result = re.finditer(s, basestr) + if result: + for i in result: + resultlist.append([i.span()[0],i.span()[1]-1]) + + return resultlist + + for i in unique_peptides: + resultlist = findstr(ref, i, resultlist) + # Sort and merge the interval list + resultlist.sort() + l, r = 0, 1 + while r < len(resultlist): + x1, y1 = resultlist[l][0], resultlist[l][1] + x2, y2 = resultlist[r][0], resultlist[r][1] + if x2 > y1: + l += 1 + r += 1 + else: + resultlist[l] = [x1, max(y1, y2)] + resultlist.pop(r) + + coverage_length = np.array([i[1] - i[0] + 1 for i in resultlist]).sum() + protein_coverage = format(coverage_length / len(ref), ".3f") return protein_coverage @@ -575,8 +592,8 @@ def match_in_report(report, target, max, flag, level): :param report: Dataframe for Dia-NN main report :type report: pandas.core.frame.DataFrame - :param target: The "pr_id" column in out_mztab_PEH(level="peptide") or the "accession" column in out_mztab_PRH(level="protein") - :type target: pandas.core.series.Series + :param target: The value of "pr_id" column in out_mztab_PEH(level="peptide") or the "accession" column in out_mztab_PRH(level="protein") + :type target: str :param max: max_assay or max_study_variable :type max: int :param flag: Match the "study_variable" column(flag=1) or the "ms_run" column(flag=0) in the filter result @@ -619,8 +636,8 @@ def PRH_match_report(report, target): :param report: Dataframe for Dia-NN main report :type report: pandas.core.frame.DataFrame - :param target: The "accession" column in report - :type target: pandas.core.series.Series + :param target: The value of "accession" column in report + :type target: str :return: A tuple contains multiple information to construct PRH sub-table :rtype: tuple """ @@ -637,8 +654,8 @@ def PEH_match_report(report, target): :param report: Dataframe for Dia-NN main report :type report: pandas.core.frame.DataFrame - :param target: The "pr_id" column in report - :type target: pandas.core.series.Series + :param target: The value of "pr_id" column in report + :type target: str :return: A tuple contains multiple information to construct PEH sub-table :rtype: tuple """ diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index fdcc0700..a4955140 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -2,11 +2,11 @@ process DIANNCONVERT { tag "$exp_design.Name" label 'process_low' - conda (params.enable_conda ? "conda-forge::pandas_schema bioconda::sdrf-pipelines=0.0.21" : null) + conda (params.enable_conda ? "conda-forge::pandas_schema bioconda::pyopenms=2.8.0-0" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.21--pyhdfd78af_0" + container "https://depot.galaxyproject.org/singularity/pyopenms:2.8.0--py36h24c8720_0" } else { - container "quay.io/biocontainers/sdrf-pipelines:0.0.21--pyhdfd78af_0" + container "quay.io/biocontainers/pyopenms:2.8.0--py36h24c8720_0" } input: @@ -14,11 +14,8 @@ process DIANNCONVERT { path(exp_design) path(report_pg) path(report_pr) - path(report_unique_gene) val(meta) path(fasta) - val(charge) - val(missed_cleavages) output: path "*msstats_in.csv", emit: out_msstats @@ -37,17 +34,16 @@ process DIANNCONVERT { --exp_design "${exp_design}" \\ --pg_matrix "${report_pg}" \\ --pr_matrix "${report_pr}" \\ - --unique_matrix "${report_unique_gene}" \\ --dia_params "${dia_params}" \\ --fasta "${fasta}" \\ - --charge ${charge} \\ - --missed_cleavages ${missed_cleavages} \\ - --qvalue_threshold ${params.protein_level_fdr_cutoff} \\ + --charge $params.max_precursor_charge \\ + --missed_cleavages $params.allowed_missed_cleavages \\ + --qvalue_threshold $params.protein_level_fdr_cutoff \\ |& tee convert_report.log cat <<-END_VERSIONS > versions.yml "${task.process}": - sdrf-pipelines: \$(echo "0.0.21") + pyopenms: \$(echo "2.8.0") END_VERSIONS """ } diff --git a/modules/local/diannconvert/meta.yml b/modules/local/diannconvert/meta.yml index c3ad33ea..f52123a2 100644 --- a/modules/local/diannconvert/meta.yml +++ b/modules/local/diannconvert/meta.yml @@ -1,10 +1,11 @@ name: DIANNCONVERT -description: A module to convert DIA report files to MSstats +description: A module to convert DIA report files to MSstats, Triqler and mzTab keywords: - DIA-NN - conversion - MSstats - Triqler + - mzTab tools: - custom: description: | @@ -20,6 +21,21 @@ input: type: file description: An experimental design file including Sample and replicates column et al. pattern: "*.tsv" + - report_pr: + type: file + description: A text table containing normalized quantities for precursors. They are filtered at 1% FDR, using both global and run-specific q-values for precursors + pattern: "*pr_matrix.tsv" + - report_pg: + type: file + description: A text table containing normalized quantities for protein groups. They are filtered at 1% FDR, using global q-values for protein groups + pattern: "*pg_matrix.tsv" + - meta: + type: map + description: Groovy Map containing sample information + - fasta: + type: file + description: Protein sequence database in Fasta format. + pattern: "*.{fasta,fa}" output: - out_msstats: type: file @@ -29,9 +45,14 @@ output: type: file description: Triqler input file pattern: "out_triqler.tsv" + - out_mztab: + type: file + description: mzTab + pattern: "*.mztab" - version: type: file description: File containing software version pattern: "versions.yml" authors: - "@daichengxin" + - "@wanghong" diff --git a/workflows/dia.nf b/workflows/dia.nf index a290d4f9..b4e2ed3e 100644 --- a/workflows/dia.nf +++ b/workflows/dia.nf @@ -85,8 +85,8 @@ workflow DIA { // // MODULE: DIANNCONVERT // - DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, DIANNSUMMARY.out.unique_gene_matrix, - ch_result.meta.first(), params.database, params.max_precursor_charge, params.allowed_missed_cleavages) + DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, + ch_result.meta.first(), params.database) ch_software_versions = ch_software_versions.mix(DIANNCONVERT.out.version.ifEmpty(null)) // From a82c4ccf251ec2eabad14d14ea9af740d4622908 Mon Sep 17 00:00:00 2001 From: Hong Wang <88552471+WangHong007@users.noreply.github.com> Date: Tue, 2 Aug 2022 20:06:00 +0800 Subject: [PATCH 44/47] Update main.nf --- modules/local/diannconvert/main.nf | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index a4955140..7f3fb48f 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -2,12 +2,10 @@ process DIANNCONVERT { tag "$exp_design.Name" label 'process_low' - conda (params.enable_conda ? "conda-forge::pandas_schema bioconda::pyopenms=2.8.0-0" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/pyopenms:2.8.0--py36h24c8720_0" - } else { - container "quay.io/biocontainers/pyopenms:2.8.0--py36h24c8720_0" - } + conda (params.enable_conda ? 'bioconda::multiqc=1.12' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }" input: path(report) From d3df9a440e0e022927c3c4ec4eb50af181b6a511 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 2 Aug 2022 15:23:51 +0100 Subject: [PATCH 45/47] Update modules.config --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 98572169..149fe473 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -176,7 +176,7 @@ process { // IDFILTER on PROTEIN LEVEL level = params.protein_quant == 'strictly_unique_peptides' ? 'prot' : 'protgroup' - decoys_present = params.quantify_decoys ? '' : '-remove_decoys' + decoys_present = params.quantify_decoys ? ' ' : '-remove_decoys' withName: '.*:TMT:PROTEININFERENCE:IDFILTER' { ext.args = [ "-score:${level} \"$params.protein_level_fdr_cutoff\"", From d4839c8d5a50fd2327d114278837105562841938 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Wed, 3 Aug 2022 18:54:25 +0800 Subject: [PATCH 46/47] Fix --- bin/diann_convert.py | 47 +++++++++++++++--------------- modules/local/diannconvert/main.nf | 10 ++++--- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/bin/diann_convert.py b/bin/diann_convert.py index fcb88717..27721683 100755 --- a/bin/diann_convert.py +++ b/bin/diann_convert.py @@ -322,27 +322,27 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df): out_mztab_PRH = pd.DataFrame() out_mztab_PRH = pg.drop(["Protein.Names"], axis = 1) out_mztab_PRH = out_mztab_PRH.rename(columns = {"Protein.Group":"accession", "First.Protein.Description":"description"}) - out_mztab_PRH.loc[:, "PRH"] = "PRT" - index = out_mztab_PRH.loc[:, "PRH"] - out_mztab_PRH.drop(labels = ["PRH"], axis = 1, inplace = True) - out_mztab_PRH.insert(0, "PRH", index) out_mztab_PRH.loc[:, "database"] = database null_col = ["taxid", "species", "database_version", "search_engine", "opt_global_Posterior_Probability_score", "opt_global_nr_found_peptides", "opt_global_cv_PRIDE:0000303_decoy_hit"] for i in null_col: out_mztab_PRH.loc[:, i] = "null" - - out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.loc[:, "accession"] + out_mztab_PRH.loc[:, "accession"] = out_mztab_PRH.apply(lambda x: x["accession"].split(";")[0], axis = 1) - prh_series = out_mztab_PRH["accession"].str.split(";", expand = True).stack().reset_index(level = 1, drop = True) + protein_details_df = out_mztab_PRH[out_mztab_PRH["opt_global_result_type"] == "indistinguishable_protein_group"] + prh_series = protein_details_df["Protein.Ids"].str.split(";", expand = True).stack().reset_index(level = 1, drop = True) prh_series.name = "accession" - out_mztab_PRH = out_mztab_PRH.drop("accession", axis = 1).join(prh_series).reset_index().drop(columns = "index") + protein_details_df = protein_details_df.drop("accession", axis = 1).join(prh_series).reset_index().drop(columns = "index") + protein_details_df.loc[:, "opt_global_result_type"] = protein_details_df.apply(lambda x: "protein_details", axis = 1) + # protein_details_df = protein_details_df[-protein_details_df["accession"].str.contains("-")] + out_mztab_PRH = pd.concat([out_mztab_PRH, protein_details_df]).reset_index(drop = True) + out_mztab_PRH.loc[:, "protein_coverage"] = out_mztab_PRH.apply( lambda x: calculate_protein_coverage(report, x["accession"], x["Protein.Ids"], fasta_df), axis=1, result_type="expand") out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.apply( - lambda x: "null" if x["opt_global_result_type"] == "protein_details" else x["ambiguity_members"], axis=1) + lambda x: x["Protein.Ids"] if x["opt_global_result_type"] == "indistinguishable_protein_group" else "null", axis=1) out_mztab_PRH[["modifiedSequence", "best_search_engine_score[1]"]] = out_mztab_PRH.apply( lambda x: PRH_match_report(report, x["accession"]), axis=1, result_type="expand") @@ -361,7 +361,11 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df): lambda x: match_in_report(report, x['accession'], max_study_variable, 1, 'protein'), axis=1, result_type='expand') out_mztab_PRH = out_mztab_PRH.drop(["Genes", "modifiedSequence", "Protein.Ids"], axis=1) - out_mztab_PRH.fillna("null", inplace=True) + out_mztab_PRH.fillna("null", inplace=True) + out_mztab_PRH.loc[:, "PRH"] = "PRT" + index = out_mztab_PRH.loc[:, "PRH"] + out_mztab_PRH.drop(labels = ["PRH"], axis = 1, inplace = True) + out_mztab_PRH.insert(0, "PRH", index) # out_mztab_PRH.to_csv("./out_protein.mztab", sep=",", index=False) return out_mztab_PRH @@ -388,11 +392,6 @@ def mztab_PEH(report, pr, precursor_list, index_ref, database, fasta_df): out_mztab_PEH = out_mztab_PEH.drop(["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"], axis = 1) out_mztab_PEH = out_mztab_PEH.rename(columns = {"Stripped.Sequence":"sequence", "Protein.Ids":"accession", "Modified.Sequence":"opt_global_cv_MS:1000889_peptidoform_sequence", "Precursor.Charge":"charge"}) - out_mztab_PEH.loc[:, "PEH"] = "PEP" - index = out_mztab_PEH.loc[:, "PEH"] - out_mztab_PEH.drop(labels = ["PEH"], axis = 1, inplace = True) - out_mztab_PEH.insert(0, "PEH", index) - out_mztab_PEH.loc[:, "database"] = database out_mztab_PEH.loc[:, "modifications"] = out_mztab_PEH.apply( lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), axis=1, result_type="expand") @@ -432,6 +431,11 @@ def mztab_PEH(report, pr, precursor_list, index_ref, database, fasta_df): out_mztab_PEH[["opt_global_feature_id", "spectra_ref"]] = out_mztab_PEH.apply(lambda x:("null", "null"),axis = 1, result_type = "expand") out_mztab_PEH = out_mztab_PEH.drop(["Precursor.Id", "Genes", "pr_id"], axis = 1) out_mztab_PEH.fillna("null", inplace = True) + out_mztab_PEH.loc[:, "PEH"] = "PEP" + index = out_mztab_PEH.loc[:, "PEH"] + out_mztab_PEH.drop(labels = ["PEH"], axis = 1, inplace = True) + out_mztab_PEH.insert(0, "PEH", index) + out_mztab_PEH.loc[:, "database"] = database # out_mztab_PEH.to_csv("./out_peptide.mztab", sep=",", index=False) return out_mztab_PEH @@ -455,10 +459,6 @@ def mztab_PSH(report, database, fasta_df): "opt_global_cv_MS:1000889_peptidoform_sequence", "opt_global_SpecEValue_score", "opt_global_q-value", "opt_global_q-value_score"] out_mztab_PSH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" - out_mztab_PSH.loc[:, "PSH"] = "PSM" - index = out_mztab_PSH.loc[:, "PSH"] - out_mztab_PSH.drop(labels = ["PSH"], axis = 1, inplace = True) - out_mztab_PSH.insert(0, "PSH", index) out_mztab_PSH.loc[:, "PSM_ID"] = out_mztab_PSH.index out_mztab_PSH.loc[:, "unique"] = out_mztab_PSH.apply(lambda x: Unique(x["accession"], x["sequence"], fasta_df), axis=1, result_type="expand") out_mztab_PSH.loc[:, "database"] = database @@ -476,6 +476,10 @@ def mztab_PSH(report, database, fasta_df): out_mztab_PSH = out_mztab_PSH.drop(["Genes"], axis = 1) out_mztab_PSH.fillna("null", inplace = True) + out_mztab_PSH.loc[:, "PSH"] = "PSM" + index = out_mztab_PSH.loc[:, "PSH"] + out_mztab_PSH.drop(labels = ["PSH"], axis = 1, inplace = True) + out_mztab_PSH.insert(0, "PSH", index) # out_mztab_PSH.to_csv("./out_psms.mztab", sep=",", index=False) return out_mztab_PSH @@ -507,10 +511,7 @@ def classify_result_type(target): :rtype: str """ if ";" in target["Protein.Ids"]: - if ";" in target["Protein.Group"]: - return "protein_details" - else: - return "indistinguishable_protein_group" + return "indistinguishable_protein_group" else: return "single_protein" diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index 7f3fb48f..c14aa623 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -2,10 +2,12 @@ process DIANNCONVERT { tag "$exp_design.Name" label 'process_low' - conda (params.enable_conda ? 'bioconda::multiqc=1.12' : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }" + conda (params.enable_conda ? "conda-forge::pandas_schema bioconda::sdrf-pipelines=0.0.21" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.21--pyhdfd78af_0" + } else { + container "quay.io/biocontainers/sdrf-pipelines:0.0.21--pyhdfd78af_0" + } input: path(report) From 3e3a52411defa852045cc228c711b9feb7fc5aa2 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Wed, 3 Aug 2022 19:50:41 +0800 Subject: [PATCH 47/47] Change container --- modules/local/diannconvert/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index c14aa623..28491b7f 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -2,11 +2,11 @@ process DIANNCONVERT { tag "$exp_design.Name" label 'process_low' - conda (params.enable_conda ? "conda-forge::pandas_schema bioconda::sdrf-pipelines=0.0.21" : null) + conda (params.enable_conda ? "conda-forge::pandas_schema conda-forge::lzstring bioconda::pmultiqc=0.0.13" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.21--pyhdfd78af_0" + container "https://depot.galaxyproject.org/singularity/pmultiqc:0.0.13--pyhdfd78af_0" } else { - container "quay.io/biocontainers/sdrf-pipelines:0.0.21--pyhdfd78af_0" + container "quay.io/biocontainers/pmultiqc:0.0.13--pyhdfd78af_0" } input: