diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 7068c251..94fa05b2 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -13,7 +13,7 @@ jobs: - name: Check PRs if: github.repository == 'nf-core/quantms' run: | - "{ [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/quantms ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]]" + { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/quantms ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets @@ -42,4 +42,3 @@ jobs: Thanks again for your contribution! repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false -# diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml new file mode 100644 index 00000000..8bf2def0 --- /dev/null +++ b/.github/workflows/fix-linting.yml @@ -0,0 +1,55 @@ +name: Fix linting from a comment +on: + issue_comment: + types: [created] + +jobs: + deploy: + # Only run if comment is on a PR with the main repo, and if it contains the magic keywords + if: > + contains(github.event.comment.html_url, '/pull/') && + contains(github.event.comment.body, '@nf-core-bot fix linting') && + github.repository == 'nf-core/quantms' + runs-on: ubuntu-latest + steps: + # Use the @nf-core-bot token to check out so we can push later + - uses: actions/checkout@v3 + with: + token: ${{ secrets.nf_core_bot_auth_token }} + + # Action runs on the issue comment, so we don't get the PR by default + # Use the gh cli to check out the PR + - name: Checkout Pull Request + run: gh pr checkout ${{ github.event.issue.number }} + env: + GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} + + - uses: actions/setup-node@v2 + + - name: Install Prettier + run: npm install -g prettier @prettier/plugin-php + + # Check that we actually need to fix something + - name: Run 'prettier --check' + id: prettier_status + run: | + if prettier --check ${GITHUB_WORKSPACE}; then + echo "::set-output name=result::pass" + else + echo "::set-output name=result::fail" + fi + + - name: Run 'prettier --write' + if: steps.prettier_status.outputs.result == 'fail' + run: prettier --write ${GITHUB_WORKSPACE} + + - name: Commit & push changes + if: steps.prettier_status.outputs.result == 'fail' + run: | + git config user.email "core@nf-co.re" + git config user.name "nf-core-bot" + git config push.default upstream + git add . + git status + git commit -m "[automated] Fix linting with Prettier" + git push diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index e9cf5de3..77358dee 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -48,7 +48,7 @@ jobs: wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ - - uses: actions/setup-python@v1 + - uses: actions/setup-python@v3 with: python-version: "3.6" architecture: "x64" @@ -78,5 +78,3 @@ jobs: lint_log.txt lint_results.md PR_number.txt - -# diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 91c487a1..04758f61 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -26,4 +26,3 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} path: linting-logs/lint_results.md -# diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 00000000..d0e7ae58 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,9 @@ +email_template.html +.nextflow* +work/ +data/ +results/ +.DS_Store +testing/ +testing* +*.pyc diff --git a/assets/email_template.html b/assets/email_template.html index 49215a6a..7a622e34 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -1,111 +1,53 @@ - - - - + + + + - - - nf-core/quantms Pipeline Report - - -
- + + nf-core/quantms Pipeline Report + + +
-

nf-core/quantms v${version}

-

Run Name: $runName

+ - <% if (!success){ out << """ -
-

nf-core/quantms execution completed unsuccessfully!

+

nf-core/quantms v${version}

+

Run Name: $runName

+ +<% if (!success){ + out << """ +
+

nf-core/quantms execution completed unsuccessfully!

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

The full error message was:

-
${errorReport}
-
- """ } else { out << """ -
+
${errorReport}
+
+ """ +} else { + out << """ +
nf-core/quantms execution completed successfully! -
- """ } %> +
+ """ +} +%> -

The workflow was completed at $dateComplete (duration: $duration)

-

The command used to launch the workflow was as follows:

-
-$commandLine
+

The workflow was completed at $dateComplete (duration: $duration)

+

The command used to launch the workflow was as follows:

+
$commandLine
-

Pipeline Configuration:

- - - <% out << summary.collect{ k,v -> " - - - - - " }.join("\n") %> - -
- $k - -
$v
-
+

Pipeline Configuration:

+ + + <% out << summary.collect{ k,v -> "" }.join("\n") %> + +
$k
$v
-

nf-core/quantms

-

https://github.com/nf-core/quantms

-
- +

nf-core/quantms

+

https://github.com/nf-core/quantms

+ +
+ + diff --git a/bin/diann_convert.py b/bin/diann_convert.py index bf421092..b3f8a204 100755 --- a/bin/diann_convert.py +++ b/bin/diann_convert.py @@ -50,9 +50,9 @@ def convert(ctx, diann_report, exp_design): out_triqler.loc[:, "searchScore"] = 1 - report['PEP'] out_msstats = out_msstats[out_msstats["Intensity"] != 0] - out_msstats.to_csv('./out_msstats.csv', sep=',', index=False) + out_msstats.to_csv(os.path.splitext(os.path.basename(exp_design))[0] + '_out_msstats.csv', sep=',', index=False) out_triqler = out_triqler[out_triqler["intensity"] != 0] - out_triqler.to_csv('./out_triqler.tsv', sep='\t', index=False) + out_triqler.to_csv(os.path.splitext(os.path.basename(exp_design))[0] + '_out_triqler.tsv', sep='\t', index=False) def query_expdesign_value(reference, f_table, s_table): query_reference = f_table[f_table["run"] == reference] diff --git a/bin/msstats_plfq.R b/bin/msstats_plfq.R index 0253b22e..0203d58a 100755 --- a/bin/msstats_plfq.R +++ b/bin/msstats_plfq.R @@ -212,16 +212,21 @@ if (l == 1) { #write all comparisons into one CSV file write.table(test.MSstats$ComparisonResult, file=paste0(args[8],"_comparisons.csv"), quote=FALSE, sep='\t', row.names = FALSE) - groupComparisonPlots(data=test.MSstats$ComparisonResult, type="ComparisonPlot", - width=12, height=12,dot.size = 2) - - test.MSstats$Volcano <- test.MSstats$ComparisonResult[!is.na(test.MSstats$ComparisonResult$pvalue),] - groupComparisonPlots(data=test.MSstats$Volcano, type="VolcanoPlot", + valid_comp_data <- test.MSstats$ComparisonResult[!is.na(test.MSstats$ComparisonResult$pvalue), ] + if (nrow(valid_comp_data[!duplicated(valid_comp_data$Protein),]) < 2) { + warning("Warning: Not enough proteins with valid p-values for comparison. Skipping groupComparisonPlots step!") + } else { + groupComparisonPlots(data=test.MSstats$ComparisonResult, type="ComparisonPlot", width=12, height=12,dot.size = 2) - # Otherwise it fails since the behaviour is undefined - if (nrow(contrast_mat) > 1) { - groupComparisonPlots(data=test.MSstats$ComparisonResult, type="Heatmap", + groupComparisonPlots(data=valid_comp_data, type="VolcanoPlot", width=12, height=12,dot.size = 2) + + # Otherwise it fails since the behaviour is undefined + if (nrow(contrast_mat) > 1) { + groupComparisonPlots(data=test.MSstats$ComparisonResult, type="Heatmap", + width=12, height=12,dot.size = 2) + } } + } diff --git a/bin/msstats_tmt.R b/bin/msstats_tmt.R index 0ac644d8..20270083 100755 --- a/bin/msstats_tmt.R +++ b/bin/msstats_tmt.R @@ -210,11 +210,6 @@ if(typeof(reference_norm) == 'character'){ reference_norm <- char_to_boolean[reference_norm] } -if (length(args)<12) { - # outputPrefix - args[12] <- './msstatsiso' -} - csv_input <- args[1] contrast_str <- args[2] control_str <- args[3] @@ -246,5 +241,28 @@ if (l == 1) { test.MSstatsTMT <- groupComparisonTMT(contrast.matrix=contrast_mat, data=processed.quant) #TODO allow manual input (e.g. proteins of interest) - write.table(test.MSstatsTMT$ComparisonResult, file=paste0(args[12],"_comparisons.csv"), quote=FALSE, sep='\t', row.names = FALSE) + write.table(test.MSstatsTMT$ComparisonResult, file=paste0("msstatsiso_results.csv"), quote=FALSE, sep='\t', row.names = FALSE) + + valid_comp_data <- test.MSstatsTMT$ComparisonResult[!is.na(test.MSstatsTMT$ComparisonResult$pvalue), ] + + if (nrow(valid_comp_data[!duplicated(valid_comp_data$Protein),]) < 2) { + warning("Warning: Not enough proteins with valid p-values for comparison. Skipping groupComparisonPlots step!") + } else { + require(MSstats) + # BUG groupComparisonPlots function: re-run OpenMStoMSstatsTMTFormat + quant <- OpenMStoMSstatsTMTFormat(data, useUniquePeptide=useUniquePeptide, rmPSM_withfewMea_withinRun=rmPSM_withfewMea_withinRun, + rmProtein_with1Feature=rmProtein_with1Feature + ) + groupComparisonPlots(data=test.MSstatsTMT$ComparisonResult, type="ComparisonPlot", width=12, height=12, dot.size = 2) + + groupComparisonPlots(data=valid_comp_data, type="VolcanoPlot", + width=12, height=12, dot.size = 2) + + # Otherwise it fails since the behavior is undefined + if (nrow(contrast_mat) > 1) { + groupComparisonPlots(data=test.MSstatsTMT$ComparisonResult, type="Heatmap", + width=12, height=12, dot.size = 2) + } + } + } diff --git a/bin/prepare_diann_parameters.py b/bin/prepare_diann_parameters.py index 5abde49c..afbc3f76 100755 --- a/bin/prepare_diann_parameters.py +++ b/bin/prepare_diann_parameters.py @@ -22,9 +22,6 @@ def generate_cfg(ctx, enzyme, fix_mod, var_mod, precursor_tolerence, precursor_t cut = enzyme_cut(enzyme) unimod_database = UnimodDatabase() fix_ptm, var_ptm = convert_mod(unimod_database, fix_mod, var_mod) - mass_acc, mass_acc_ms1 = mass_tolerence(precursor_tolerence, precursor_tolerence_unit, fragment_tolerence, fragment_tolerence_unit) - mass_acc = " --mass-acc " + str(mass_acc) - mass_acc_ms1 = " --mass-acc-ms1 " + str(mass_acc_ms1) var_ptm_str = " --var-mod " fix_ptm_str = " --fixed-mod " @@ -36,11 +33,7 @@ def generate_cfg(ctx, enzyme, fix_mod, var_mod, precursor_tolerence, precursor_t diann_var_ptm += (var_ptm_str + mod) with open("diann_config.cfg", "w") as f: - f.write("--dir ./mzMLs --cut " + cut + diann_fix_ptm + diann_var_ptm + mass_acc + mass_acc_ms1 + - " --matrices --report-lib-info") - - with open("library_config.cfg", "w") as f: - f.write("--cut " + cut + diann_fix_ptm + diann_var_ptm + " --gen-spec-lib --smart-profiling") + f.write("--cut " + cut + diann_fix_ptm + diann_var_ptm) def convert_mod(unimod_database, fix_mod, var_mod): pattern = re.compile("\((.*?)\)") @@ -110,23 +103,6 @@ def enzyme_cut(enzyme): cut = "--cut" return cut -def mass_tolerence(prec, precursor_tolerence_unit, frag, fragment_tolerence_unit): - if precursor_tolerence_unit == "ppm": - ms1_tolerence = prec - else: - # Default 10 ppm - print("Warning: " + precursor_tolerence_unit + " unit not supported for DIA-NN. Default 10 ppm") - ms1_tolerence = 10 - - if fragment_tolerence_unit == "ppm": - ms2_tolerence = frag - else: - # Default 20 ppm - ms2_tolerence = 20 - print("Warning: " + fragment_tolerence_unit + " unit not supported for DIA-NN. Default 20 ppm") - - return ms1_tolerence, ms2_tolerence - cli.add_command(generate_cfg) if __name__ == "__main__": diff --git a/conf/modules.config b/conf/modules.config index aeb9d30e..51d21db3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -235,6 +235,11 @@ process { // DIA-NN withName: '.*:DIA:.*' { ext.when = { !params.enable_conda } + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } } diff --git a/conf/test_dia.config b/conf/test_dia.config index bd5eac56..b6654bc6 100644 --- a/conf/test_dia.config +++ b/conf/test_dia.config @@ -34,4 +34,12 @@ params { max_precursor_charge = 3 allowed_missed_cleavages = 1 diann_normalize = false + max_mods = 2 +} + +process { + // thermorawfileparser + withName: 'NFCORE_QUANTMS:QUANTMS:FILE_PREPARATION:THERMORAWFILEPARSER' { + publishDir = [path: { "${params.outdir}/${task.process.tokenize(':')[-1].toLowerCase()}" }, pattern: "*.log" ] + } } diff --git a/modules/local/assemble_empirical_library/main.nf b/modules/local/assemble_empirical_library/main.nf new file mode 100644 index 00000000..014390b6 --- /dev/null +++ b/modules/local/assemble_empirical_library/main.nf @@ -0,0 +1,49 @@ +process ASSEMBLE_EMPIRICAL_LIBRARY { + label 'process_low' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' : + 'biocontainers/diann:v1.8.1_cv1' }" + + input: + path(mzMLs) + path("quant/*") + path(lib) + path(diann_config) + + output: + path "empirical_library.tsv", emit: empirical_library + path "assemble_empirical_library.log", emit: log + path "versions.yml", emit: version + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + mass_acc = params.mass_acc_automatic ? "--quick-mass-acc --individual-mass-acc" : "--mass-acc $params.mass_acc_ms2 --mass-acc-ms1 $params.mass_acc_ms1" + scan_window = params.scan_window_automatic ? "--individual-windows" : "--window $params.scan_window" + + """ + diann --f ${(mzMLs as List).join(' --f ')} \\ + --lib ${lib} \\ + --threads ${task.cpus} \\ + --out-lib empirical_library.tsv \\ + --verbose $params.diann_debug \\ + --rt-profiling \\ + --temp ./quant/ \\ + --use-quant \\ + ${mass_acc} \\ + ${scan_window} \\ + --gen-spec-lib \\ + $args \\ + |& tee assemble_empirical_library.log + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "(\\d*\\.\\d+\\.\\d+)|(\\d*\\.\\d+)") + END_VERSIONS + """ +} diff --git a/modules/local/diannsearch/meta.yml b/modules/local/assemble_empirical_library/meta.yml similarity index 50% rename from modules/local/diannsearch/meta.yml rename to modules/local/assemble_empirical_library/meta.yml index 3bb1c2b3..9dbe167e 100644 --- a/modules/local/diannsearch/meta.yml +++ b/modules/local/assemble_empirical_library/meta.yml @@ -1,8 +1,7 @@ -name: diannsearch -description: A module for DIA library free analysis based on DIA-NN. +name: assemble_empirical_library +description: A module for assembling an empirical library based on a preliminary analysis of the in-silico library with DIA-NN. keywords: - DIA-NN - - library free - DIA tools: - DIA-NN: @@ -11,34 +10,30 @@ tools: homepage: https://github.com/vdemichev/DiaNN documentation: https://github.com/vdemichev/DiaNN input: + - mzMLs: + type: file + description: Spectra file in mzML format + pattern: "*.mzML" + - quant: + type: file + description: The .quant files from DIA-NN preliminary analysis, containing IDs and quantification information. - lib: type: file description: Spectra library file pattern: "*.tsv" - - spectra: - type: dir - description: The directory for spectra files - - searchdb: - type: file - description: Fasta sequence file - pattern: "*.{fasta,fa}" - - cfg: + - diann_config: type: dir description: Specifies a file to load options/commands from. pattern: "*.cfg" output: - - report: - type: file - description: Main report file. A text table containing precursor and protein IDs, as well as plenty of associated information. Most column names are self-explanatory. - pattern: "diann_report.tsv" - - report_stat: + - empirical_library: type: file - description: Contains a number of QC metrics which can be used for data filtering, e.g. to exclude failed runs, or as a readout for method optimization. - pattern: "diann_report.stats.tsv" + description: An empirical spectral library from the .quant files. + pattern: "empirical_library.tsv" - log: type: file description: DIA-NN log file - pattern: "diann_report.log.txt" + pattern: "assemble_empirical_library.log" - version: type: file description: File containing software version diff --git a/modules/local/diann_preliminary_analysis/main.nf b/modules/local/diann_preliminary_analysis/main.nf new file mode 100644 index 00000000..ce505756 --- /dev/null +++ b/modules/local/diann_preliminary_analysis/main.nf @@ -0,0 +1,47 @@ +process DIANN_PRELIMINARY_ANALYSIS { + tag "$meta.id" + label 'process_high' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' : + 'biocontainers/diann:v1.8.1_cv1' }" + + input: + tuple val(meta), file(mzML), file(predict_tsv), file(diann_config) + + output: + path "*.quant", emit: diann_quant + tuple val(meta), path("*_diann.log"), emit: log + path "versions.yml", emit: version + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + mass_acc = params.mass_acc_automatic ? "--quick-mass-acc --individual-mass-acc" : "--mass-acc $params.mass_acc_ms2 --mass-acc-ms1 $params.mass_acc_ms1" + scan_window = params.scan_window_automatic ? "--individual-windows" : "--window $params.scan_window" + time_corr_only = params.time_corr_only ? "--time-corr-only" : "" + + """ + diann --lib ${predict_tsv} \\ + --f ${mzML} \\ + --threads ${task.cpus} \\ + --verbose $params.diann_debug \\ + ${scan_window} \\ + --temp ./ \\ + --min-corr $params.min_corr \\ + --corr-diff $params.corr_diff \\ + ${mass_acc} \\ + ${time_corr_only} \\ + $args \\ + |& tee ${mzML.baseName}_diann.log + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "(\\d*\\.\\d+\\.\\d+)|(\\d*\\.\\d+)") + END_VERSIONS + """ +} diff --git a/modules/local/diann_preliminary_analysis/meta.yml b/modules/local/diann_preliminary_analysis/meta.yml new file mode 100644 index 00000000..989a0aed --- /dev/null +++ b/modules/local/diann_preliminary_analysis/meta.yml @@ -0,0 +1,42 @@ +name: diann_preliminary_analysis +description: A module for preliminary analysis of individual raw files with DIA-NN using the in-silico generated library (also from DIA-NN). +keywords: + - DIA-NN + - DIA +tools: + - DIA-NN: + description: | + DIA-NN - a universal software for data-independent acquisition (DIA) proteomics data processing by Demichev. + homepage: https://github.com/vdemichev/DiaNN + documentation: https://github.com/vdemichev/DiaNN +input: + - meta: + type: map + description: Groovy Map containing sample information + - predict_tsv: + type: file + description: Silico-predicted spectral library by deep leaning predictor in DIA-NN + pattern: "*.tsv" + - mzML: + type: file + description: Spectra file in mzML format + pattern: "*.mzML" + - cfg: + type: dir + description: Specifies a file to load options/commands from. + pattern: "*.cfg" +output: + - diann_quant: + type: file + description: Quantification file from DIA-NN + pattern: "*.quant" + - log: + type: file + description: DIA-NN log file + pattern: "*_diann.log" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@daichengxin" diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index f21461e9..ae96c2a9 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -1,4 +1,5 @@ process DIANNCONVERT { + tag "$exp_design.Name" label 'process_low' conda (params.enable_conda ? "conda-forge::pandas_schema bioconda::sdrf-pipelines=0.0.21" : null) @@ -13,8 +14,8 @@ process DIANNCONVERT { path(exp_design) output: - path "out_msstats.csv", emit: out_msstats - path "out_triqler.tsv", emit: out_triqler + path "*out_msstats.csv", emit: out_msstats + path "*out_triqler.tsv", emit: out_triqler path "versions.yml", emit: version script: diff --git a/modules/local/diannsearch/main.nf b/modules/local/diannsearch/main.nf deleted file mode 100644 index 7bbc56ef..00000000 --- a/modules/local/diannsearch/main.nf +++ /dev/null @@ -1,64 +0,0 @@ -process DIANNSEARCH { - label 'process_high' - - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' : - 'biocontainers/diann:v1.8.1_cv1' }" - - input: - file 'mzMLs/*' - file(lib_tsv) - file(searchdb) - file(diann_config) - - output: - path "diann_report.tsv", emit: report - path "diann_report.stats.tsv", emit: report_stat - path "diann_report.log.txt", emit: log - path "versions.yml", emit: version - path "*.tsv" - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - mbr = params.targeted_only ? "" : "--reanalyse" - normalize = params.diann_normalize ? "" : "--no-norm" - - min_pr_mz = params.min_pr_mz ? "--min-pr-mz params.min_pr_mz":"" - max_pr_mz = params.max_pr_mz ? "--max-pr-mz params.max_pr_mz":"" - min_fr_mz = params.min_fr_mz ? "--min_fr_mz params.min_fr_mz":"" - max_fr_mz = params.max_fr_mz ? "--max_fr_mz params.max_fr_mz":"" - - """ - diann `cat diann_config.cfg` \\ - --lib ${(lib_tsv as List).join('--lib ')} \\ - --relaxed-prot-inf \\ - --fasta ${searchdb} \\ - ${min_pr_mz} \\ - ${max_pr_mz} \\ - ${min_fr_mz} \\ - ${max_fr_mz} \\ - --threads ${task.cpus} \\ - --missed-cleavages $params.allowed_missed_cleavages \\ - --min-pep-len $params.min_peptide_length \\ - --max-pep-len $params.max_peptide_length \\ - --min-pr-charge $params.min_precursor_charge \\ - --max-pr-charge $params.max_precursor_charge \\ - --var-mods $params.max_mods \\ - --matrix-spec-q $params.matrix_spec_q \\ - ${mbr} \\ - --reannotate \\ - ${normalize} \\ - --out diann_report.tsv \\ - --verbose $params.diann_debug \\ - |& tee diann.log - - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "(\\d*\\.\\d+\\.\\d+)|(\\d*\\.\\d+)") - END_VERSIONS - """ -} diff --git a/modules/local/diannsummary/main.nf b/modules/local/diannsummary/main.nf new file mode 100644 index 00000000..fe0015ca --- /dev/null +++ b/modules/local/diannsummary/main.nf @@ -0,0 +1,53 @@ +process DIANNSUMMARY { + label 'process_high' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' : + 'biocontainers/diann:v1.8.1_cv1' }" + + input: + file(mzMLs) + file(empirical_library) + file("quant/") + file(fasta) + + output: + path "diann_report.tsv", emit: report + path "diannsummary.log", emit: log + path "versions.yml", emit: version + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + mass_acc = params.mass_acc_automatic ? "--quick-mass-acc --individual-mass-acc" : "--mass-acc $params.mass_acc_ms2 --mass-acc-ms1 $params.mass_acc_ms1" + scan_window = params.scan_window_automatic ? "--individual-windows" : "--window $params.scan_window" + species_genes = params.species_genes ? "--species-genes": "" + + """ + diann --lib ${empirical_library} \\ + --fasta ${fasta} \\ + --f ${(mzMLs as List).join(' --f ')} \\ + --threads ${task.cpus} \\ + --verbose $params.diann_debug \\ + ${scan_window} \\ + ${mass_acc} \\ + --temp ./quant/ \\ + --relaxed-prot-inf \\ + --pg-level $params.pg_level \\ + ${species_genes} \\ + --use-quant \\ + --matrices \\ + --out diann_report.tsv \\ + $args \\ + |& tee diannsummary.log + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "(\\d*\\.\\d+\\.\\d+)|(\\d*\\.\\d+)") + END_VERSIONS + """ +} diff --git a/modules/local/diannsummary/meta.yml b/modules/local/diannsummary/meta.yml new file mode 100644 index 00000000..0d06c7ac --- /dev/null +++ b/modules/local/diannsummary/meta.yml @@ -0,0 +1,47 @@ +name: diannsummary +description: A module for summarization of results from DIA-NN analysis. +keywords: + - DIA-NN + - DIA +tools: + - DIA-NN: + description: | + DIA-NN - a universal software for data-independent acquisition (DIA) proteomics data processing by Demichev. + homepage: https://github.com/vdemichev/DiaNN + documentation: https://github.com/vdemichev/DiaNN +input: + - empirical_library: + type: file + description: Empirical spectral library generated by DIA-NN + pattern: "*.tsv" + - mzMLs: + type: file + description: Spectra files in mzML format. + pattern: "*.mzML" + - fasta: + type: file + description: Protein sequence database in Fasta format. + pattern: "*.{fasta,fa}" + - quant: + type: file + description: Identification and Quantification file from DIA-NN. + pattern: "*.quant" + - cfg: + type: dir + description: Specifies a file to load options/commands. + pattern: "*.cfg" +output: + - report: + type: file + description: A text table containing precursor and protein IDs, as well as plenty of associated information. + pattern: "*.tsv" + - log: + type: file + description: DIA-NN log file + pattern: "*_diann.log" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@daichengxin" diff --git a/modules/local/generate_diann_cfg/main.nf b/modules/local/generate_diann_cfg/main.nf index 54e34539..26d44186 100644 --- a/modules/local/generate_diann_cfg/main.nf +++ b/modules/local/generate_diann_cfg/main.nf @@ -12,8 +12,7 @@ process GENERATE_DIANN_CFG { val(meta) output: - path "diann_config.cfg", emit: search_cfg - path "library_config.cfg", emit: library_config + path "diann_config.cfg", emit: diann_cfg path "versions.yml", emit: version path "*.log" diff --git a/modules/local/generate_diann_cfg/meta.yml b/modules/local/generate_diann_cfg/meta.yml index 55368414..4d3fbfd1 100644 --- a/modules/local/generate_diann_cfg/meta.yml +++ b/modules/local/generate_diann_cfg/meta.yml @@ -14,11 +14,7 @@ input: type: map description: Groovy Map containing sample information output: - - library_config: - type: file - description: DIA-NN configure file for library generation - pattern: "library_config.cfg" - - search_cfg: + - diann_cfg: type: file description: DIA-NN configure file for search and quantification pattern: "diann_config.cfg" diff --git a/modules/local/individual_final_analysis/main.nf b/modules/local/individual_final_analysis/main.nf new file mode 100644 index 00000000..53ca6f25 --- /dev/null +++ b/modules/local/individual_final_analysis/main.nf @@ -0,0 +1,53 @@ +process INDIVIDUAL_FINAL_ANALYSIS { + tag "$mzML.baseName" + label 'process_high' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' : + 'biocontainers/diann:v1.8.1_cv1' }" + + input: + tuple file(mzML), file(diann_log), file(library) + + output: + path "*.quant", emit: diann_quant + path "*_final_diann.log", emit: log + path "versions.yml", emit: version + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + mass_acc = params.mass_acc_ms2 + scan_window = params.scan_window + ms1_accuracy = params.mass_acc_ms1 + + if (params.mass_acc_automatic | params.scan_window_automatic){ + mass_acc = "\$(cat ${diann_log} | grep \"Averaged recommended settings\" | cut -d ' ' -f 11 | tr -cd \"[0-9]\")" + scan_window = "\$(cat ${diann_log} | grep \"Averaged recommended settings\" | cut -d ' ' -f 19 | tr -cd \"[0-9]\")" + ms1_accuracy = "\$(cat ${diann_log} | grep \"Averaged recommended settings\" | cut -d ' ' -f 15 | tr -cd \"[0-9]\")" + } + + """ + diann --lib ${library} \\ + --f ${mzML} \\ + --threads ${task.cpus} \\ + --verbose $params.diann_debug \\ + --temp ./ \\ + --mass-acc \$(echo ${mass_acc}) \\ + --mass-acc-ms1 \$(echo ${ms1_accuracy}) \\ + --window \$(echo ${scan_window}) \\ + --no-ifs-removal \\ + --no-main-report \\ + --no-prot-inf \\ + $args \\ + |& tee ${mzML.baseName}_final_diann.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + DIA-NN: \$(diann 2>&1 | grep "DIA-NN" | grep -oP "(\\d*\\.\\d+\\.\\d+)|(\\d*\\.\\d+)") + END_VERSIONS + """ +} diff --git a/modules/local/librarygeneration/meta.yml b/modules/local/individual_final_analysis/meta.yml similarity index 52% rename from modules/local/librarygeneration/meta.yml rename to modules/local/individual_final_analysis/meta.yml index 64d5aee7..755d7ab7 100644 --- a/modules/local/librarygeneration/meta.yml +++ b/modules/local/individual_final_analysis/meta.yml @@ -1,8 +1,7 @@ -name: librarygeneration -description: A module for library generation based on DIA-NN. +name: INDIVIDUAL_FINAL_ANALYSIS +description: A module for final analysis of individual raw files based on DIA-NN. keywords: - DIA-NN - - library free - DIA tools: - DIA-NN: @@ -11,35 +10,38 @@ tools: homepage: https://github.com/vdemichev/DiaNN documentation: https://github.com/vdemichev/DiaNN input: - - spectra: + - meta: + type: map + description: Groovy Map containing sample information + - diann_log: type: file - description: Spectra file - pattern: "*.mzML" - - fasta: + description: DIA-NN log file + pattern: "*.log" + - library: type: file - description: FASTA sequence databases - pattern: "*.{fasta,fa}" - - cfg: + description: Silico-predicted spectral library by deep leaning predictor in DIA-NN + pattern: "*.tsv" + - mzML: type: file - description: specifies a configuration file to load options/commands from. + description: Spectra file in mzML format + pattern: "*.mzML" + - diann_config: + type: dir + description: Specifies a file to load options/commands from. pattern: "*.cfg" output: - - lib_splib: - type: file - description: Spectra library file. - pattern: "*_lib.tsv" - - speclib: + - diann_quant: type: file - description: Spectral library file based on speclib format - pattern: "*.tsv.speclib" - - predict_speclib: + description: Quantification file from DIA-NN + pattern: "*.quant" + - lib: type: file - description: Silico-predicted spectral library by deep leaning predictor in DIA-NN - pattern: "*.predicted.speclib" + description: Spectral library file + pattern: "*.tsv" - log: type: file description: DIA-NN log file - pattern: "report.log.txt" + pattern: "*_diann.log" - version: type: file description: File containing software version diff --git a/modules/local/msstats/main.nf b/modules/local/msstats/main.nf index b1c587bf..a3373a50 100644 --- a/modules/local/msstats/main.nf +++ b/modules/local/msstats/main.nf @@ -35,8 +35,7 @@ process MSSTATS { ${params.msstatslfq_quant_summary_method} \\ ${msstats_csv_input.baseName} \\ $args \\ - > msstats.log \\ - || echo "Optional MSstats step failed. Please check logs and re-run or do a manual statistical analysis." + |& tee msstats.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/msstatstmt/main.nf b/modules/local/msstatstmt/main.nf index c7e28014..a0f518f9 100644 --- a/modules/local/msstatstmt/main.nf +++ b/modules/local/msstatstmt/main.nf @@ -39,13 +39,13 @@ process MSSTATSTMT { ${params.msstatsiso_reference_normalization} \\ ${msstatstmt_csv_input.baseName} \\ $args \\ - > msstats_tmt.log \\ - || echo "Optional MSstatsTMT step failed. Please check logs and re-run or do a manual statistical analysis." + |& tee msstats_tmt.log cat <<-END_VERSIONS > versions.yml "${task.process}": - MSstatsTMT: \$(echo "2.2.0") + r-base: \$(echo \$(R --version 2>&1) | sed 's/^.*R version //; s/ .*\$//') + bioconductor-msstatstmt: \$(Rscript -e "library(MSstatsTMT); cat(as.character(packageVersion('MSstatsTMT')))") END_VERSIONS """ } diff --git a/modules/local/librarygeneration/main.nf b/modules/local/silicolibrarygeneration/main.nf similarity index 75% rename from modules/local/librarygeneration/main.nf rename to modules/local/silicolibrarygeneration/main.nf index 6191a5ff..0570b998 100644 --- a/modules/local/librarygeneration/main.nf +++ b/modules/local/silicolibrarygeneration/main.nf @@ -1,20 +1,19 @@ -process LIBRARYGENERATION { - label 'process_high' +process SILICOLIBRARYGENERATION { + tag "$fasta.Name" + label 'process_medium' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' : 'biocontainers/diann:v1.8.1_cv1' }" input: - tuple file(mzml), file(fasta) - file(library_config) + file(fasta) + file(diann_config) output: - path "*_lib.tsv", emit: lib_splib path "versions.yml", emit: version - path "report.log.txt", emit: log - path "*.tsv.speclib", emit: speclib path "*.predicted.speclib", emit: predict_speclib + path "diann.log", emit: log when: task.ext.when == null || task.ext.when @@ -24,15 +23,13 @@ process LIBRARYGENERATION { min_pr_mz = params.min_pr_mz ? "--min-pr-mz $params.min_pr_mz":"" max_pr_mz = params.max_pr_mz ? "--max-pr-mz $params.max_pr_mz":"" - min_fr_mz = params.min_fr_mz ? "--min_fr_mz $params.min_fr_mz":"" - max_fr_mz = params.max_fr_mz ? "--max_fr_mz $params.max_fr_mz":"" + min_fr_mz = params.min_fr_mz ? "--min-fr-mz $params.min_fr_mz":"" + max_fr_mz = params.max_fr_mz ? "--max-fr-mz $params.max_fr_mz":"" """ - diann `cat library_config.cfg` \\ + diann `cat diann_config.cfg` \\ --fasta ${fasta} \\ --fasta-search \\ - --f ${mzml} \\ - --out-lib ${mzml.baseName}_lib.tsv \\ ${min_pr_mz} \\ ${max_pr_mz} \\ ${min_fr_mz} \\ @@ -46,6 +43,7 @@ process LIBRARYGENERATION { --threads ${task.cpus} \\ --predictor \\ --verbose $params.diann_debug \\ + --gen-spec-lib \\ |& tee diann.log diff --git a/nextflow.config b/nextflow.config index 8e1d26b3..b01bb619 100644 --- a/nextflow.config +++ b/nextflow.config @@ -145,8 +145,19 @@ params { quantify_decoys = false // DIA-NN - matrix_spec_q = 0.01 - diann_debug = 3 + diann_debug = 3 + scan_window = 8 + scan_window_automatic = true + min_corr = 2.0 + corr_diff = 1.0 + time_corr_only = true + mass_acc_automatic = true + mass_acc_ms2 = 13 + mass_acc_ms1 = 7 + pg_level = 2 + species_genes = false + + // TODO think about unifying it with DDA parameters min_pr_mz = null max_pr_mz = null @@ -321,7 +332,7 @@ trace { } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg" + file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" } manifest { diff --git a/nextflow_schema.json b/nextflow_schema.json index ee1b6ed7..7ca72984 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -812,12 +812,55 @@ "enum": ["dda", "dia"], "fa_icon": "far fa-list-ol" }, - "matrix_spec_q": { - "type": "number", - "description": "run-specific protein q-value filtering will be used, in addition to the global q-value filtering, when saving protein matrices. The ability to filter based on run-specific protein q-values, which allows to generate highly reliable data, is one of the advantages of DIA-NN", - "default": 0.01, + "mass_acc_automatic": { + "type": "boolean", + "default": true, + "description": "Choosing the MS2 mass accuracy setting automatically", + "fa_icon": "fas fa-toggle-on" + }, + "mass_acc_ms2": { + "type": "integer", + "description": "Sets the MS2 mass accuracy to N ppm", + "default": 11, "fa_icon": "fas fa-filter" }, + "mass_acc_ms1": { + "type": "integer", + "description": "Sets the MS1 mass accuracy to N ppm", + "default": 15, + "fa_icon": "fas fa-filter" + }, + "scan_window_automatic": { + "type": "boolean", + "description": "Choosing scan_window setting automatically", + "default": true, + "fa_icon": "fas fa-toggle-on" + }, + "scan_window": { + "type": "integer", + "description": "Set the scan window radius to a specific value", + "fa_icon": "fas fa-filter", + "help_text": " Ideally, should be approximately equal to the average number of data points per peak", + "default": 7 + }, + "min_corr": { + "type": "number", + "description": "Only peaks with correlation sum exceeding min_corr will be considered", + "fa_icon": "fas fa-filter", + "default": 2.0 + }, + "corr_diff": { + "type": "number", + "description": "Peaks with correlation sum below corr_diff from maximum will not be considered", + "fa_icon": "fas fa-filter", + "default": 1.0 + }, + "time_corr_only": { + "type": "boolean", + "description": "A single score will be used until RT alignment to save memory", + "fa_icon": "fas fa-filter", + "default": true + }, "min_pr_mz": { "type": "number", "description": "The minimum precursor m/z for the in silico library generation or library-free search", @@ -838,6 +881,19 @@ "description": "The maximum fragment m/z for the in silico library generation or library-free search", "fa_icon": "fas fa-filter" }, + "pg_level": { + "type": "number", + "description": "Controls the protein inference mode", + "fa_icon": "fas fa-list-ol", + "enum": [0, 1, 2], + "default": 2 + }, + "species_genes": { + "type": "boolean", + "description": "Instructs DIA-NN to add the organism identifier to the gene names", + "fa_icon": "far fa-check-square", + "default": false + }, "diann_debug": { "type": "integer", "description": "Debug level", diff --git a/workflows/dia.nf b/workflows/dia.nf index c5fa3a53..fd066cc0 100644 --- a/workflows/dia.nf +++ b/workflows/dia.nf @@ -7,11 +7,14 @@ // // MODULES: Local to the pipeline // -include { DIANNSEARCH } from '../modules/local/diannsearch/main' include { GENERATE_DIANN_CFG as DIANNCFG } from '../modules/local/generate_diann_cfg/main' include { DIANNCONVERT } from '../modules/local/diannconvert/main' -include { LIBRARYGENERATION } from '../modules/local/librarygeneration/main' include { MSSTATS } from '../modules/local/msstats/main' +include { DIANN_PRELIMINARY_ANALYSIS } from '../modules/local/diann_preliminary_analysis/main' +include { ASSEMBLE_EMPIRICAL_LIBRARY } from '../modules/local/assemble_empirical_library/main' +include { SILICOLIBRARYGENERATION } from '../modules/local/silicolibrarygeneration/main' +include { INDIVIDUAL_FINAL_ANALYSIS } from '../modules/local/individual_final_analysis/main' +include { DIANNSUMMARY } from '../modules/local/diannsummary/main' // // SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules @@ -43,16 +46,48 @@ workflow DIA { } .set { result } - DIANNCFG(result.meta) + DIANNCFG(result.meta.first()) ch_software_versions = ch_software_versions.mix(DIANNCFG.out.version.ifEmpty(null)) - LIBRARYGENERATION(result.mzml.combine(searchdb), DIANNCFG.out.library_config) + // + // MODULE: SILICOLIBRARYGENERATION + // + SILICOLIBRARYGENERATION(searchdb, DIANNCFG.out.diann_cfg) - DIANNSEARCH(result.mzml.collect(), LIBRARYGENERATION.out.lib_splib.collect(), searchdb, DIANNCFG.out.search_cfg.distinct()) - ch_software_versions = ch_software_versions.mix(DIANNSEARCH.out.version.ifEmpty(null)) + // + // MODULE: DIANN_PRELIMINARY_ANALYSIS + // + DIANN_PRELIMINARY_ANALYSIS(file_preparation_results.combine(SILICOLIBRARYGENERATION.out.predict_speclib).combine(DIANNCFG.out.diann_cfg)) + ch_software_versions = ch_software_versions.mix(DIANN_PRELIMINARY_ANALYSIS.out.version.ifEmpty(null)) - DIANNCONVERT(DIANNSEARCH.out.report, ch_expdesign) - versions = ch_software_versions + // + // MODULE: ASSEMBLE_EMPIRICAL_LIBRARY + // + ASSEMBLE_EMPIRICAL_LIBRARY(result.mzml.collect(), + DIANN_PRELIMINARY_ANALYSIS.out.diann_quant.collect(), + SILICOLIBRARYGENERATION.out.predict_speclib, + DIANNCFG.out.diann_cfg + ) + ch_software_versions = ch_software_versions.mix(ASSEMBLE_EMPIRICAL_LIBRARY.out.version.ifEmpty(null)) + + // + // MODULE: INDIVIDUAL_FINAL_ANALYSIS + // + INDIVIDUAL_FINAL_ANALYSIS(result.mzml.combine(ASSEMBLE_EMPIRICAL_LIBRARY.out.log).combine(ASSEMBLE_EMPIRICAL_LIBRARY.out.empirical_library)) + ch_software_versions = ch_software_versions.mix(INDIVIDUAL_FINAL_ANALYSIS.out.version.ifEmpty(null)) + + // + // MODULE: DIANNSUMMARY + // + DIANNSUMMARY(result.mzml.collect(), ASSEMBLE_EMPIRICAL_LIBRARY.out.empirical_library, + INDIVIDUAL_FINAL_ANALYSIS.out.diann_quant.collect(), searchdb) + ch_software_versions = ch_software_versions.mix(DIANNSUMMARY.out.version.ifEmpty(null)) + + // + // MODULE: DIANNCONVERT + // + DIANNCONVERT(DIANNSUMMARY.out.report, ch_expdesign) + ch_software_versions = ch_software_versions.mix(DIANNCONVERT.out.version.ifEmpty(null)) // // MODULE: MSSTATS @@ -63,9 +98,11 @@ workflow DIA { ch_software_versions = ch_software_versions.mix(MSSTATS.out.version.ifEmpty(null)) } + versions = ch_software_versions + emit: versions = versions - diann_report = DIANNSEARCH.out.report + diann_report = DIANNSUMMARY.out.report msstats_csv = DIANNCONVERT.out.out_msstats out_triqler = DIANNCONVERT.out.out_triqler msstats_out = ch_msstats_out