diff --git a/.gitignore b/.gitignore index 99acd2c..10bcc4e 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ null/ .cursor/rules/codacy.mdc .codacy/ .github/instructions/codacy.instructions.md +docs/superpowers/ diff --git a/conf/diann_versions/v1_8_1.config b/conf/diann_versions/v1_8_1.config index 2821ee2..5bfb7ef 100644 --- a/conf/diann_versions/v1_8_1.config +++ b/conf/diann_versions/v1_8_1.config @@ -2,6 +2,8 @@ * DIA-NN 1.8.1 container override (public biocontainers) * Used by merge_ci.yml for version × feature matrix testing. */ +params.diann_version = '1.8.1' + process { withLabel: diann { container = 'docker.io/biocontainers/diann:v1.8.1_cv1' diff --git a/conf/diann_versions/v2_1_0.config b/conf/diann_versions/v2_1_0.config index bedfe95..9915726 100644 --- a/conf/diann_versions/v2_1_0.config +++ b/conf/diann_versions/v2_1_0.config @@ -2,6 +2,8 @@ * DIA-NN 2.1.0 container override (private ghcr.io) * Used by merge_ci.yml for version × feature matrix testing. */ +params.diann_version = '2.1.0' + process { withLabel: diann { container = 'ghcr.io/bigbio/diann:2.1.0' diff --git a/conf/diann_versions/v2_2_0.config b/conf/diann_versions/v2_2_0.config index 1e79ea3..93ea4ee 100644 --- a/conf/diann_versions/v2_2_0.config +++ b/conf/diann_versions/v2_2_0.config @@ -2,6 +2,8 @@ * DIA-NN 2.2.0 container override (private ghcr.io) * Used by merge_ci.yml for version × feature matrix testing. */ +params.diann_version = '2.2.0' + process { withLabel: diann { container = 'ghcr.io/bigbio/diann:2.2.0' diff --git a/conf/tests/test_dia_local.config b/conf/tests/test_dia_local.config new file mode 100644 index 0000000..8d523c5 --- /dev/null +++ b/conf/tests/test_dia_local.config @@ -0,0 +1,18 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Local container overrides for testing with dev builds of sdrf-pipelines and quantms-utils. + Uses docker.io/ prefix to prevent quay.io registry from being prepended. 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +process { + withName: 'SDRF_PARSING' { + container = 'docker.io/local/sdrf-pipelines:dev' + } + withName: 'SAMPLESHEET_CHECK' { + container = 'docker.io/local/quantms-utils:dev' + } + withName: 'DIANN_MSSTATS' { + container = 'docker.io/local/quantms-utils:dev' + } +} diff --git a/docs/usage.md b/docs/usage.md index 4464cac..a9b652a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -88,6 +88,95 @@ nextflow run . -profile test_dia_dotd,docker --outdir results nextflow run . -profile test_latest_dia,docker --outdir results ``` +## DIA-NN parameters + +The pipeline passes parameters to DIA-NN at different steps. Some parameters come from the SDRF metadata (per-file), some from `nextflow.config` defaults, and some from the command line. The tables below document each parameter, its source, and which pipeline steps use it. + +### Parameter sources + +Parameters are normally resolved in this priority order (exception: when `--mass_acc_automatic` or `--scan_window_automatic` is enabled, INDIVIDUAL_ANALYSIS prefers the calibrated mass accuracies and scan window over the SDRF tolerances): + +1. **SDRF metadata** (per-file, from `convert-diann` design file) — highest priority +2. **Pipeline parameters** (`--param_name` on command line or params file) +3. 
**Nextflow defaults** (`nextflow.config`) — lowest priority + +### Pipeline steps + +| Step | Description | +| ------------------------------- | ------------------------------------------------------------------- | +| **INSILICO_LIBRARY_GENERATION** | Predicts a spectral library from FASTA using DIA-NN's deep learning | +| **PRELIMINARY_ANALYSIS** | Per-file calibration and mass accuracy estimation (first pass) | +| **ASSEMBLE_EMPIRICAL_LIBRARY** | Builds consensus empirical library from preliminary results | +| **INDIVIDUAL_ANALYSIS** | Per-file quantification with the empirical library (second pass) | +| **FINAL_QUANTIFICATION** | Aggregates all files into protein/peptide matrices | + +### Per-file parameters from SDRF + +These parameters are extracted per-file from the SDRF via `convert-diann` and stored in `diann_design.tsv`: + +| DIA-NN flag | SDRF column | Design column | Steps | Notes | +| ---------------- | -------------------------------------------------- | ------------------------ | ----------------------- | ----------------------------------------------- | +| `--mass-acc-ms1` | `comment[precursor mass tolerance]` | `PrecursorMassTolerance` | PRELIMINARY, INDIVIDUAL | Falls back to auto-detect if missing or not ppm | +| `--mass-acc` | `comment[fragment mass tolerance]` | `FragmentMassTolerance` | PRELIMINARY, INDIVIDUAL | Falls back to auto-detect if missing or not ppm | +| `--min-pr-mz` | `comment[ms1 scan range]` or `comment[ms min mz]` | `MS1MinMz` | PRELIMINARY, INDIVIDUAL | Per-file for GPF; global broadest for INSILICO | +| `--max-pr-mz` | `comment[ms1 scan range]` or `comment[ms max mz]` | `MS1MaxMz` | PRELIMINARY, INDIVIDUAL | Per-file for GPF; global broadest for INSILICO | +| `--min-fr-mz` | `comment[ms2 scan range]` or `comment[ms2 min mz]` | `MS2MinMz` | PRELIMINARY, INDIVIDUAL | Per-file for GPF; global broadest for INSILICO | +| `--max-fr-mz` | `comment[ms2 scan range]` or `comment[ms2 max mz]` | `MS2MaxMz` | PRELIMINARY, INDIVIDUAL | 
Per-file for GPF; global broadest for INSILICO | + +### Global parameters from config + +These parameters apply globally across all files. They are set in `diann_config.cfg` (from SDRF) or as pipeline parameters: + +| DIA-NN flag | Pipeline parameter | Default | Steps | Notes | +| --------------------------------------------- | -------------------------------------------------- | ----------------------------------------------- | ---------------------------------------- | --------------------------------------------------------------- | +| `--cut` | (from SDRF enzyme) | — | ALL | Enzyme cut rule, derived from `comment[cleavage agent details]` | +| `--fixed-mod` | (from SDRF) | — | ALL | Fixed modifications from `comment[modification parameters]` | +| `--var-mod` | (from SDRF) | — | ALL | Variable modifications from `comment[modification parameters]` | +| `--monitor-mod` | `--enable_mod_localization` + `--mod_localization` | `false` / `Phospho (S),Phospho (T),Phospho (Y)` | PRELIMINARY, ASSEMBLE, INDIVIDUAL, FINAL | PTM site localization scoring (DIA-NN 1.8.x only) | +| `--window` | `--scan_window` | `8` | PRELIMINARY, ASSEMBLE, INDIVIDUAL | Scan window; auto-detected when `--scan_window_automatic=true` | +| `--quick-mass-acc` | `--quick_mass_acc` | `true` | PRELIMINARY | Fast mass accuracy calibration | +| `--min-corr 2 --corr-diff 1 --time-corr-only` | `--performance_mode` | `true` | PRELIMINARY | High-speed, low-RAM mode | +| `--pg-level` | `--pg_level` | `2` | INDIVIDUAL, FINAL | Protein grouping level | +| `--species-genes` | `--species_genes` | `false` | FINAL | Use species-specific gene names | +| `--no-norm` | `--diann_normalize` | `true` | FINAL | Disable normalization when `false` | + +### PTM site localization (`--monitor-mod`) + +DIA-NN supports PTM site localization scoring via `--monitor-mod`. When enabled, DIA-NN reports `PTM.Site.Confidence` and `PTM.Q.Value` columns for the specified modifications. 
+ +**Important**: `--monitor-mod` is applied to all DIA-NN steps **except INSILICO_LIBRARY_GENERATION** (where it has no effect). It is particularly important for: + +- **PRELIMINARY_ANALYSIS**: Affects PTM-aware scoring during calibration. +- **ASSEMBLE_EMPIRICAL_LIBRARY**: Strongly affects empirical library generation for PTM peptides. +- **INDIVIDUAL_ANALYSIS** and **FINAL_QUANTIFICATION**: Enables PTM site confidence scoring. + +Note: For DIA-NN 2.0+, `--monitor-mod` is no longer needed — PTM localization is handled automatically by `--var-mod`. The flag is only used for DIA-NN 1.8.x. + +To enable PTM site localization: + +```bash +nextflow run bigbio/quantmsdiann \ + --enable_mod_localization \ + --mod_localization 'Phospho (S),Phospho (T),Phospho (Y)' \ + ... +``` + +The parameter accepts two formats: + +- **Modification names** (quantms-compatible): `Phospho (S),Phospho (T),Phospho (Y)` — site info in parentheses is stripped, the base name is mapped to UniMod +- **UniMod accessions** (direct): `UniMod:21,UniMod:1` + +Supported modification name mappings: + +| Name | UniMod ID | Example | +| ----------- | ------------ | ------------------------------------- | +| Phospho | `UniMod:21` | `Phospho (S),Phospho (T),Phospho (Y)` | +| GlyGly | `UniMod:121` | `GlyGly (K)` | +| Acetyl | `UniMod:1` | `Acetyl (Protein N-term)` | +| Oxidation | `UniMod:35` | `Oxidation (M)` | +| Deamidated | `UniMod:7` | `Deamidated (N),Deamidated (Q)` | +| Methylation | `UniMod:34` | `Methylation (K),Methylation (R)` | + ## Optional outputs By default, only final result files are published. Intermediate files can be exported using `save_*` parameters or via `ext.*` properties in a custom Nextflow config. 
@@ -154,6 +243,59 @@ Use `screen`, `tmux`, or the Nextflow `-bg` flag to run the pipeline in the back nextflow run bigbio/quantmsdiann -profile docker --input sdrf.tsv --database db.fasta --outdir results -bg ``` +## Developer testing with local containers + +When developing changes to `sdrf-pipelines` or `quantms-utils`, you can build local Docker containers and test them with the pipeline without publishing to a registry. + +### 1. Build local dev containers + +```bash +# From sdrf-pipelines repo +cd /path/to/sdrf-pipelines +docker build -f Dockerfile.dev -t local/sdrf-pipelines:dev . + +# From quantms-utils repo +cd /path/to/quantms-utils +docker build -f Dockerfile.dev -t local/quantms-utils:dev . +``` + +### 2. Run the pipeline with local containers + +Use the `test_dia_local.config` to override container references: + +```bash +nextflow run main.nf \ + -profile test_dia,docker \ + -c conf/tests/test_dia_local.config \ + --outdir results +``` + +This config (`conf/tests/test_dia_local.config`) overrides: + +- `SDRF_PARSING` → `local/sdrf-pipelines:dev` +- `SAMPLESHEET_CHECK` → `local/quantms-utils:dev` +- `DIANN_MSSTATS` → `local/quantms-utils:dev` + +### 3. 
Using pre-converted mzML files + +To skip ThermoRawFileParser (useful on macOS/ARM where Mono crashes): + +```bash +# Convert raw files with ThermoRawFileParser v2.0+ +docker run --rm --platform=linux/amd64 \ + -v /path/to/raw:/data -v /path/to/mzml:/out \ + quay.io/biocontainers/thermorawfileparser:2.0.0.dev--h9ee0642_0 \ + ThermoRawFileParser -d /data -o /out -f 2 + +# Run pipeline with pre-converted files +nextflow run main.nf \ + -profile test_dia,docker \ + -c conf/tests/test_dia_local.config \ + --root_folder /path/to/mzml \ + --local_input_type mzML \ + --outdir results +``` + ## Nextflow memory requirements Add the following to your environment to limit Java memory: diff --git a/modules/local/diann/assemble_empirical_library/main.nf b/modules/local/diann/assemble_empirical_library/main.nf index 776b69b..034b95e 100644 --- a/modules/local/diann/assemble_empirical_library/main.nf +++ b/modules/local/diann/assemble_empirical_library/main.nf @@ -30,7 +30,8 @@ process ASSEMBLE_EMPIRICAL_LIBRARY { '--temp', '--threads', '--verbose', '--lib', '--f', '--fasta', '--mass-acc', '--mass-acc-ms1', '--window', '--individual-mass-acc', '--individual-windows', - '--out-lib', '--use-quant', '--gen-spec-lib', '--rt-profiling'] + '--out-lib', '--use-quant', '--gen-spec-lib', '--rt-profiling', + '--monitor-mod', '--var-mod', '--fixed-mod'] // Sort by length descending so longer flags (e.g. 
--mass-acc-ms1) are matched before shorter prefixes (--mass-acc) blocked.sort { a -> -a.length() }.each { flag -> def flagPattern = '(?<=^|\\s)' + java.util.regex.Pattern.quote(flag) + '(?=\\s|\$)(\\s+(?!-{1,2}[a-zA-Z])\\S+)*' @@ -60,8 +61,8 @@ process ASSEMBLE_EMPIRICAL_LIBRARY { ls -lcth - # Extract --var-mod and --fixed-mod flags from diann_config.cfg (DIA-NN best practice) - mod_flags=\$(cat ${diann_config} | grep -oP '(--var-mod\\s+\\S+|--fixed-mod\\s+\\S+)' | tr '\\n' ' ') + # Extract --var-mod, --fixed-mod, and --monitor-mod flags from diann_config.cfg + mod_flags=\$(cat ${diann_config} | grep -oP '(--var-mod\\s+\\S+|--fixed-mod\\s+\\S+|--monitor-mod\\s+\\S+)' | tr '\\n' ' ') diann --f ${(ms_files as List).join(' --f ')} \\ --lib ${lib} \\ diff --git a/modules/local/diann/diann_msstats/main.nf b/modules/local/diann/diann_msstats/main.nf index a767910..470309d 100644 --- a/modules/local/diann/diann_msstats/main.nf +++ b/modules/local/diann/diann_msstats/main.nf @@ -3,8 +3,8 @@ process DIANN_MSSTATS { label 'process_medium' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.25--pyh106432d_0' : - 'biocontainers/quantms-utils:0.0.25--pyh106432d_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.28--pyh106432d_0' : + 'biocontainers/quantms-utils:0.0.28--pyh106432d_0' }" input: path(report) diff --git a/modules/local/diann/final_quantification/main.nf b/modules/local/diann/final_quantification/main.nf index 5c38b35..99740fe 100644 --- a/modules/local/diann/final_quantification/main.nf +++ b/modules/local/diann/final_quantification/main.nf @@ -46,7 +46,8 @@ process FINAL_QUANTIFICATION { '--temp', '--threads', '--verbose', '--lib', '--f', '--fasta', '--use-quant', '--matrices', '--out', '--relaxed-prot-inf', '--pg-level', '--qvalue', '--window', '--individual-windows', - '--species-genes', '--report-decoys', '--xic', '--no-norm'] + '--species-genes', '--report-decoys', '--xic', '--no-norm', + '--monitor-mod', '--var-mod', '--fixed-mod'] // Sort by length descending so longer flags (e.g. --individual-windows) are matched before shorter prefixes (--window) blocked.sort { a -> -a.length() }.each { flag -> def flagPattern = '(?<=^|\\s)' + java.util.regex.Pattern.quote(flag) + '(?=\\s|\$)(\\s+(?!-{1,2}[a-zA-Z])\\S+)*' @@ -72,8 +73,8 @@ process FINAL_QUANTIFICATION { # Notes: if .quant files are passed, mzml/.d files are not accessed, so the name needs to be passed but files # do not need to pe present. 
- # Extract --var-mod and --fixed-mod flags from diann_config.cfg (DIA-NN best practice) - mod_flags=\$(cat ${diann_config} | grep -oP '(--var-mod\\s+\\S+|--fixed-mod\\s+\\S+)' | tr '\\n' ' ') + # Extract --var-mod, --fixed-mod, and --monitor-mod flags from diann_config.cfg + mod_flags=\$(cat ${diann_config} | grep -oP '(--var-mod\\s+\\S+|--fixed-mod\\s+\\S+|--monitor-mod\\s+\\S+)' | tr '\\n' ' ') diann --lib ${empirical_library} \\ --fasta ${fasta} \\ diff --git a/modules/local/diann/generate_cfg/main.nf b/modules/local/diann/generate_cfg/main.nf index 9a4adef..8377030 100644 --- a/modules/local/diann/generate_cfg/main.nf +++ b/modules/local/diann/generate_cfg/main.nf @@ -3,8 +3,8 @@ process GENERATE_CFG { label 'process_tiny' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.25--pyh106432d_0' : - 'biocontainers/quantms-utils:0.0.25--pyh106432d_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.28--pyh106432d_0' : + 'biocontainers/quantms-utils:0.0.28--pyh106432d_0' }" input: val(meta) diff --git a/modules/local/diann/individual_analysis/main.nf b/modules/local/diann/individual_analysis/main.nf index 17cc1c2..28cb3b5 100644 --- a/modules/local/diann/individual_analysis/main.nf +++ b/modules/local/diann/individual_analysis/main.nf @@ -8,7 +8,7 @@ process INDIVIDUAL_ANALYSIS { 'docker.io/biocontainers/diann:v1.8.1_cv1' }" input: - tuple val(meta), path(ms_file), path(fasta), path(diann_log), path(library) + tuple val(meta), path(ms_file), path(fasta), path(library) path(diann_config) output: @@ -25,7 +25,9 @@ process INDIVIDUAL_ANALYSIS { def blocked = ['--use-quant', '--gen-spec-lib', '--out-lib', '--matrices', '--out', '--rt-profiling', '--temp', '--threads', '--verbose', '--lib', '--f', '--fasta', '--mass-acc', '--mass-acc-ms1', '--window', - '--no-ifs-removal', '--no-main-report', '--relaxed-prot-inf', '--pg-level'] + 
'--no-ifs-removal', '--no-main-report', '--relaxed-prot-inf', '--pg-level', + '--min-pr-mz', '--max-pr-mz', '--min-fr-mz', '--max-fr-mz', + '--monitor-mod', '--var-mod', '--fixed-mod'] // Sort by length descending so longer flags (e.g. --mass-acc-ms1) are matched before shorter prefixes (--mass-acc) blocked.sort { a -> -a.length() }.each { flag -> def flagPattern = '(?<=^|\\s)' + java.util.regex.Pattern.quote(flag) + '(?=\\s|\$)(\\s+(?!-{1,2}[a-zA-Z])\\S+)*' @@ -42,28 +44,53 @@ process INDIVIDUAL_ANALYSIS { } } - scan_window = params.scan_window - - if (params.mass_acc_automatic | params.scan_window_automatic) { - mass_acc_ms2 = "\$(cat ${diann_log} | grep \"Averaged recommended settings\" | cut -d ' ' -f 11 | tr -cd \"[0-9]\")" - scan_window = "\$(cat ${diann_log} | grep \"Averaged recommended settings\" | cut -d ' ' -f 19 | tr -cd \"[0-9]\")" - mass_acc_ms1 = "\$(cat ${diann_log} | grep \"Averaged recommended settings\" | cut -d ' ' -f 15 | tr -cd \"[0-9]\")" - } else if (meta['precursormasstoleranceunit'].toLowerCase().endsWith('ppm') && meta['fragmentmasstoleranceunit'].toLowerCase().endsWith('ppm')) { - mass_acc_ms1 = meta["precursormasstolerance"] - mass_acc_ms2 = meta["fragmentmasstolerance"] + if (params.mass_acc_automatic || params.scan_window_automatic) { + if (meta.mass_acc_ms2 != "0" && meta.mass_acc_ms2 != null) { + mass_acc_ms2 = meta.mass_acc_ms2 + mass_acc_ms1 = meta.mass_acc_ms1 + scan_window = meta.scan_window + } + else if (meta['precursormasstoleranceunit']?.toLowerCase()?.endsWith('ppm') && meta['fragmentmasstoleranceunit']?.toLowerCase()?.endsWith('ppm')) { + mass_acc_ms2 = meta['fragmentmasstolerance'] + mass_acc_ms1 = meta['precursormasstolerance'] + scan_window = params.scan_window + } + else { + mass_acc_ms2 = params.mass_acc_ms2 + mass_acc_ms1 = params.mass_acc_ms1 + scan_window = params.scan_window + } } else { - mass_acc_ms2 = "\$(cat ${diann_log} | grep \"Averaged recommended settings\" | cut -d ' ' -f 11 | tr -cd \"[0-9]\")" - 
scan_window = "\$(cat ${diann_log} | grep \"Averaged recommended settings\" | cut -d ' ' -f 19 | tr -cd \"[0-9]\")" - mass_acc_ms1 = "\$(cat ${diann_log} | grep \"Averaged recommended settings\" | cut -d ' ' -f 15 | tr -cd \"[0-9]\")" + if (meta['precursormasstoleranceunit']?.toLowerCase()?.endsWith('ppm') && meta['fragmentmasstoleranceunit']?.toLowerCase()?.endsWith('ppm')) { + mass_acc_ms1 = meta["precursormasstolerance"] + mass_acc_ms2 = meta["fragmentmasstolerance"] + scan_window = params.scan_window + } + else if (meta.mass_acc_ms2 != "0" && meta.mass_acc_ms2 != null) { + mass_acc_ms2 = meta.mass_acc_ms2 + mass_acc_ms1 = meta.mass_acc_ms1 + scan_window = meta.scan_window + } + else { + mass_acc_ms2 = params.mass_acc_ms2 + mass_acc_ms1 = params.mass_acc_ms1 + scan_window = params.scan_window + } } diann_no_peptidoforms = params.diann_no_peptidoforms ? "--no-peptidoforms" : "" diann_tims_sum = params.diann_tims_sum ? "--quant-tims-sum" : "" diann_im_window = params.diann_im_window ? "--im-window $params.diann_im_window" : "" + // Per-file scan ranges from SDRF (empty = no flag, DIA-NN auto-detects) + min_pr_mz = meta['ms1minmz'] ? "--min-pr-mz ${meta['ms1minmz']}" : "" + max_pr_mz = meta['ms1maxmz'] ? "--max-pr-mz ${meta['ms1maxmz']}" : "" + min_fr_mz = meta['ms2minmz'] ? "--min-fr-mz ${meta['ms2minmz']}" : "" + max_fr_mz = meta['ms2maxmz'] ? 
"--max-fr-mz ${meta['ms2maxmz']}" : "" + """ - # Extract --var-mod and --fixed-mod flags from diann_config.cfg (DIA-NN best practice) - mod_flags=\$(cat ${diann_config} | grep -oP '(--var-mod\\s+\\S+|--fixed-mod\\s+\\S+)' | tr '\\n' ' ') + # Extract --var-mod, --fixed-mod, and --monitor-mod flags from diann_config.cfg + mod_flags=\$(cat ${diann_config} | grep -oP '(--var-mod\\s+\\S+|--fixed-mod\\s+\\S+|--monitor-mod\\s+\\S+)' | tr '\\n' ' ') diann --lib ${library} \\ --f ${ms_file} \\ @@ -78,6 +105,10 @@ process INDIVIDUAL_ANALYSIS { --no-main-report \\ --relaxed-prot-inf \\ --pg-level $params.pg_level \\ + ${min_pr_mz} \\ + ${max_pr_mz} \\ + ${min_fr_mz} \\ + ${max_fr_mz} \\ ${diann_no_peptidoforms} \\ ${diann_tims_sum} \\ ${diann_im_window} \\ diff --git a/modules/local/diann/individual_analysis/meta.yml b/modules/local/diann/individual_analysis/meta.yml index 655a16f..f7ffe35 100644 --- a/modules/local/diann/individual_analysis/meta.yml +++ b/modules/local/diann/individual_analysis/meta.yml @@ -10,10 +10,6 @@ tools: homepage: https://github.com/vdemichev/DiaNN documentation: https://github.com/vdemichev/DiaNN input: - - diann_log: - type: file - description: DIA-NN log file - pattern: "assemble_empirical_library.log" - empirical_library: type: file description: An empirical spectral library from the .quant files. diff --git a/modules/local/diann/insilico_library_generation/main.nf b/modules/local/diann/insilico_library_generation/main.nf index d61fc63..d76d79a 100644 --- a/modules/local/diann/insilico_library_generation/main.nf +++ b/modules/local/diann/insilico_library_generation/main.nf @@ -29,7 +29,7 @@ process INSILICO_LIBRARY_GENERATION { '--missed-cleavages', '--min-pep-len', '--max-pep-len', '--min-pr-charge', '--max-pr-charge', '--var-mods', '--min-pr-mz', '--max-pr-mz', '--min-fr-mz', '--max-fr-mz', - '--met-excision'] + '--met-excision', '--monitor-mod'] // Sort by length descending so longer flags (e.g. 
--fasta-search) are matched before shorter prefixes (--fasta, --f) blocked.sort { a -> -a.length() }.each { flag -> def flagPattern = '(?<=^|\\s)' + java.util.regex.Pattern.quote(flag) + '(?=\\s|\$)(\\s+(?!-{1,2}[a-zA-Z])\\S+)*' diff --git a/modules/local/diann/preliminary_analysis/main.nf b/modules/local/diann/preliminary_analysis/main.nf index 76223a4..d26215c 100644 --- a/modules/local/diann/preliminary_analysis/main.nf +++ b/modules/local/diann/preliminary_analysis/main.nf @@ -25,7 +25,9 @@ process PRELIMINARY_ANALYSIS { def blocked = ['--use-quant', '--gen-spec-lib', '--out-lib', '--matrices', '--out', '--temp', '--threads', '--verbose', '--lib', '--f', '--fasta', '--mass-acc', '--mass-acc-ms1', '--window', - '--quick-mass-acc', '--min-corr', '--corr-diff', '--time-corr-only'] + '--quick-mass-acc', '--min-corr', '--corr-diff', '--time-corr-only', + '--min-pr-mz', '--max-pr-mz', '--min-fr-mz', '--max-fr-mz', + '--monitor-mod', '--var-mod', '--fixed-mod'] // Sort by length descending so longer flags (e.g. --mass-acc-ms1) are matched before shorter prefixes (--mass-acc) blocked.sort { a -> -a.length() }.each { flag -> def flagPattern = '(?<=^|\\s)' + java.util.regex.Pattern.quote(flag) + '(?=\\s|\$)(\\s+(?!-{1,2}[a-zA-Z])\\S+)*' @@ -57,6 +59,12 @@ process PRELIMINARY_ANALYSIS { diann_tims_sum = params.diann_tims_sum ? "--quant-tims-sum" : "" diann_im_window = params.diann_im_window ? "--im-window $params.diann_im_window" : "" + // Per-file scan ranges from SDRF (empty = no flag, DIA-NN auto-detects) + min_pr_mz = meta['ms1minmz'] ? "--min-pr-mz ${meta['ms1minmz']}" : "" + max_pr_mz = meta['ms1maxmz'] ? "--max-pr-mz ${meta['ms1maxmz']}" : "" + min_fr_mz = meta['ms2minmz'] ? "--min-fr-mz ${meta['ms2minmz']}" : "" + max_fr_mz = meta['ms2maxmz'] ? 
"--max-fr-mz ${meta['ms2maxmz']}" : "" + """ # Precursor Tolerance value was: ${meta['precursormasstolerance']} # Fragment Tolerance value was: ${meta['fragmentmasstolerance']} @@ -65,8 +73,8 @@ process PRELIMINARY_ANALYSIS { # Final mass accuracy is '${mass_acc}' - # Extract --var-mod and --fixed-mod flags from diann_config.cfg (DIA-NN best practice) - mod_flags=\$(cat ${diann_config} | grep -oP '(--var-mod\\s+\\S+|--fixed-mod\\s+\\S+)' | tr '\\n' ' ') + # Extract --var-mod, --fixed-mod, and --monitor-mod flags from diann_config.cfg + mod_flags=\$(cat ${diann_config} | grep -oP '(--var-mod\\s+\\S+|--fixed-mod\\s+\\S+|--monitor-mod\\s+\\S+)' | tr '\\n' ' ') diann --lib ${predict_library} \\ --f ${ms_file} \\ @@ -77,6 +85,10 @@ process PRELIMINARY_ANALYSIS { ${mass_acc} \\ ${quick_mass_acc} \\ ${performance_flags} \\ + ${min_pr_mz} \\ + ${max_pr_mz} \\ + ${min_fr_mz} \\ + ${max_fr_mz} \\ ${diann_no_peptidoforms} \\ ${diann_tims_sum} \\ ${diann_im_window} \\ diff --git a/modules/local/parse_empirical_log_task/main.nf b/modules/local/parse_empirical_log_task/main.nf new file mode 100644 index 0000000..dcee2c8 --- /dev/null +++ b/modules/local/parse_empirical_log_task/main.nf @@ -0,0 +1,26 @@ +process PARSE_EMPIRICAL_LOG_TASK { + label 'process_single' + + input: + path log_file + + output: + stdout emit: parsed_vals + + script: + """ + # Parse only the last 'Averaged recommended settings' line; tr would otherwise concatenate the digits of repeated matches + val_mass_acc_ms2=\$(grep "Averaged recommended settings" ${log_file} | tail -n 1 | cut -d ' ' -f 11 | tr -cd "[0-9.]") + val_mass_acc_ms1=\$(grep "Averaged recommended settings" ${log_file} | tail -n 1 | cut -d ' ' -f 15 | tr -cd "[0-9.]") + val_scan_window=\$(grep "Averaged recommended settings" ${log_file} | tail -n 1 | cut -d ' ' -f 19 | tr -cd "[0-9.]") + + # Fall back to the pipeline parameters when a value could not be extracted from the log + if [ -z "\$val_mass_acc_ms2" ]; then val_mass_acc_ms2=${params.mass_acc_ms2}; fi + if [ -z 
"\$val_mass_acc_ms1" ]; then val_mass_acc_ms1=${params.mass_acc_ms1}; fi + if [ -z "\$val_scan_window" ]; then val_scan_window=${params.scan_window}; fi + + 
CALIBRATED_PARAMS_VAL="\${val_mass_acc_ms2},\${val_mass_acc_ms1},\${val_scan_window}" + + echo -n "\$CALIBRATED_PARAMS_VAL" + """ +} diff --git a/modules/local/parse_empirical_log_task/meta.yml b/modules/local/parse_empirical_log_task/meta.yml new file mode 100644 index 0000000..df72533 --- /dev/null +++ b/modules/local/parse_empirical_log_task/meta.yml @@ -0,0 +1,26 @@ +name: "parse_empirical_log_task" +description: "Parses the empirical assembly log file (from DIA-NN) to extract calibrated mass accuracies and scan window parameters for downstream analysis." +keywords: + - quantmsdiann + - diann + - log + - parse + - proteomics + - mass_accuracy +tools: + - "coreutils": + description: "Standard GNU core utilities (grep, tail, cut, tr, echo) used for text processing and log parsing." + homepage: "https://www.gnu.org/software/coreutils/" + documentation: "https://www.gnu.org/software/coreutils/manual/" +input: + - log_file: + type: file + description: "The log file generated by the empirical library assembly step (DIA-NN stdout/stderr log) containing the Averaged recommended settings." + pattern: "*.log" +output: + - parsed_vals: + type: string + description: "A comma-separated string containing extracted mass_acc_ms2, mass_acc_ms1, and scan_window values (e.g., '15,20,8')." + pattern: "*,*,*" +authors: + - "@bigbio" diff --git a/modules/local/pmultiqc/main.nf b/modules/local/pmultiqc/main.nf index 844fe24..f9d1964 100644 --- a/modules/local/pmultiqc/main.nf +++ b/modules/local/pmultiqc/main.nf @@ -2,8 +2,8 @@ process PMULTIQC { label 'process_high' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pmultiqc:0.0.39--pyhdfd78af_0' : - 'biocontainers/pmultiqc:0.0.39--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/pmultiqc:0.0.43--pyhdfd78af_0' : + 'biocontainers/pmultiqc:0.0.43--pyhdfd78af_0' }" input: path 'results/*' @@ -17,10 +17,10 @@ process PMULTIQC { script: def args = task.ext.args ?: '' - def disable_pmultiqc = (params.enable_pmultiqc) ? "--quantms_plugin" : "" - def disable_table_plots = (params.enable_pmultiqc) && (params.skip_table_plots) ? "--disable_table" : "" - def disable_idxml_index = (params.enable_pmultiqc) && (params.pmultiqc_idxml_skip) ? "--ignored_idxml" : "" - def contaminant_affix = params.contaminant_string ? "--contaminant_affix ${params.contaminant_string}" : "" + def disable_pmultiqc = (params.enable_pmultiqc) ? "--quantms-plugin" : "" + def disable_table_plots = (params.enable_pmultiqc) && (params.skip_table_plots) ? "--disable-table" : "" + def disable_idxml_index = (params.enable_pmultiqc) && (params.pmultiqc_idxml_skip) ? "--ignored-idxml" : "" + def contaminant_affix = params.contaminant_string ? "--contaminant-affix ${params.contaminant_string}" : "" """ set -x diff --git a/modules/local/samplesheet_check/main.nf b/modules/local/samplesheet_check/main.nf index ecb6e23..f2b7112 100644 --- a/modules/local/samplesheet_check/main.nf +++ b/modules/local/samplesheet_check/main.nf @@ -4,8 +4,8 @@ process SAMPLESHEET_CHECK { label 'process_tiny' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.25--pyh106432d_0' : - 'biocontainers/quantms-utils:0.0.25--pyh106432d_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.28--pyh106432d_0' : + 'biocontainers/quantms-utils:0.0.28--pyh106432d_0' }" input: path input_file @@ -20,10 +20,6 @@ process SAMPLESHEET_CHECK { script: def args = task.ext.args ?: '' - def string_skip_sdrf_validation = params.validate_ontologies == false ? "--skip_sdrf_validation" : "" - def string_skip_ms_validation = params.skip_ms_validation == true ? "--skip_ms_validation" : "" - def string_skip_factor_validation = params.skip_factor_validation == true ? "--skip_factor_validation" : "" - def string_skip_experimental_design_validation = params.skip_experimental_design_validation == true ? "--skip_experimental_design_validation" : "" def string_use_ols_cache_only = params.use_ols_cache_only == true ? "--use_ols_cache_only" : "" """ @@ -40,11 +36,8 @@ process SAMPLESHEET_CHECK { cp "${input_file}" "\$OUTPUT_FILE" fi - quantmsutilsc checksamplesheet --exp_design "\$OUTPUT_FILE" --is_sdrf \\ - ${string_skip_sdrf_validation} \\ - ${string_skip_ms_validation} \\ - ${string_skip_factor_validation} \\ - ${string_skip_experimental_design_validation} \\ + quantmsutilsc checksamplesheet --exp_design "\$OUTPUT_FILE" \\ + --minimal \\ ${string_use_ols_cache_only} \\ $args \\ 2>&1 | tee input_check.log diff --git a/modules/local/samplesheet_check/meta.yml b/modules/local/samplesheet_check/meta.yml index 28ed5e4..be51717 100644 --- a/modules/local/samplesheet_check/meta.yml +++ b/modules/local/samplesheet_check/meta.yml @@ -12,9 +12,6 @@ input: type: file description: Input samplesheet or experimental design file pattern: "*.{tsv,csv,sdrf}" - - meta: validate_ontologies - type: boolean - description: Whether to validate ontologies output: - meta: log type: file diff --git a/modules/local/sdrf_parsing/main.nf b/modules/local/sdrf_parsing/main.nf index e379fac..a89bc61 
100644 --- a/modules/local/sdrf_parsing/main.nf +++ b/modules/local/sdrf_parsing/main.nf @@ -3,40 +3,32 @@ process SDRF_PARSING { label 'process_tiny' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.33--pyhdfd78af_0' : - 'biocontainers/sdrf-pipelines:0.0.33--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.1.2--pyhdfd78af_0' : + 'biocontainers/sdrf-pipelines:0.1.2--pyhdfd78af_0' }" input: path sdrf output: - path "${sdrf.baseName}_openms_design.tsv", emit: ch_expdesign - path "${sdrf.baseName}_config.tsv" , emit: ch_sdrf_config_file - path "*.log" , emit: log - path "versions.yml" , emit: versions + path "diann_design.tsv" , emit: ch_expdesign + path "diann_config.cfg" , emit: ch_diann_cfg + path "*.log" , emit: log + path "versions.yml" , emit: versions script: def args = task.ext.args ?: '' - if (params.convert_dotd) { - extensionconversions = ",.d.gz:.mzML,.d.tar.gz:.mzML,d.tar:.mzML,.d.zip:.mzML,.d:.mzML" - } else { - extensionconversions = ",.gz:,.tar.gz:,.tar:,.zip:" - } + def mod_loc_flag = (params.enable_mod_localization && params.mod_localization) ? + "--mod_localization '${params.mod_localization}'" : '' + def diann_version_flag = params.diann_version ? 
"--diann_version '${params.diann_version}'" : '' """ - ## -t2 since the one-table format parser is broken in OpenMS2.5 - ## -l for legacy behavior to always add sample columns - - parse_sdrf convert-openms \\ - -t2 -l \\ - --extension_convert raw:mzML$extensionconversions \\ + parse_sdrf convert-diann \\ -s ${sdrf} \\ + ${mod_loc_flag} \\ + ${diann_version_flag} \\ $args \\ 2>&1 | tee ${sdrf.baseName}_parsing.log - mv openms.tsv ${sdrf.baseName}_config.tsv - mv experimental_design.tsv ${sdrf.baseName}_openms_design.tsv - cat <<-END_VERSIONS > versions.yml "${task.process}": sdrf-pipelines: \$(parse_sdrf --version 2>/dev/null | awk -F ' ' '{print \$2}') diff --git a/modules/local/sdrf_parsing/meta.yml b/modules/local/sdrf_parsing/meta.yml index 860f3f1..7c311f4 100644 --- a/modules/local/sdrf_parsing/meta.yml +++ b/modules/local/sdrf_parsing/meta.yml @@ -19,11 +19,7 @@ output: - ch_expdesign: type: file description: experimental design file in OpenMS format - pattern: "*openms_design.tsv" - - ch_sdrf_config_file: - type: file - description: config file with search engine parameters in OpenMS nomenclature - pattern: "*_config.tsv" + pattern: "*_design.tsv" - mqpar: type: file description: maxquant configuration file diff --git a/modules/local/utils/mzml_statistics/main.nf b/modules/local/utils/mzml_statistics/main.nf index cfa2c2b..f6a96d4 100644 --- a/modules/local/utils/mzml_statistics/main.nf +++ b/modules/local/utils/mzml_statistics/main.nf @@ -4,8 +4,8 @@ process MZML_STATISTICS { label 'process_single' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.25--pyh106432d_0' : - 'biocontainers/quantms-utils:0.0.25--pyh106432d_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.28--pyh106432d_0' : + 'biocontainers/quantms-utils:0.0.28--pyh106432d_0' }" input: tuple val(meta), path(ms_file) diff --git a/nextflow.config b/nextflow.config index fffd373..855456b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,10 +16,6 @@ params { // Input options and validation of sdrf files input = null - validate_ontologies = true // Enable to validate ontology terms in the SDRF - skip_ms_validation = false // Skip the validation of the MS metadata in the SDRF - skip_factor_validation = true // Skip factor values validation, factor values are important for downstream analysis - skip_experimental_design_validation = false // Skip the validation of the experimental design in the SDRF (replicates, etc) use_ols_cache_only = true // Use only the OLS cache for ontology validation (no network requests) // Tools flags @@ -55,6 +51,7 @@ params { convert_dotd = false // DIA-NN: General + diann_version = '1.8.1' // Used to control version-dependent flags (e.g. --monitor-mod for 1.8.x) diann_debug = 3 diann_speclib = null diann_extra_args = null @@ -62,6 +59,12 @@ params { // Optional outputs — control which intermediate files are published save_speclib_tsv = false // Save the TSV spectral library from in-silico generation + // DIA-NN: PTM site localization (--monitor-mod) + enable_mod_localization = false + // Comma-separated modification names, e.g. 'Phospho (S),Phospho (T),Phospho (Y)' + // or UniMod accessions, e.g. 
'UniMod:21,UniMod:1' + mod_localization = 'Phospho (S),Phospho (T),Phospho (Y)' + // DIA-NN: PRELIMINARY_ANALYSIS — calibration & mass accuracy scan_window = 8 scan_window_automatic = true @@ -78,6 +81,10 @@ params { random_preanalysis_seed = 42 empirical_assembly_ms_n = 200 + // DIA-NN: INDIVIDUAL_ANALYSIS + mass_acc_ms2 = 15 + mass_acc_ms1 = 15 + // DIA-NN: FINAL_QUANTIFICATION — summarization & output pg_level = 2 species_genes = false diff --git a/nextflow_schema.json b/nextflow_schema.json index d1215b4..c507318 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -62,32 +62,6 @@ "description": "Settings for validating the input SDRF file.", "default": "", "properties": { - "validate_ontologies": { - "type": "boolean", - "description": "Check that ontology terms in an input SDRF file exist.", - "fa_icon": "far fa-check-square", - "help_text": "If false, only a basic readability check is performed on an input SDRF file. This option is useful when ontology providers are inaccessible.", - "default": true - }, - "skip_ms_validation": { - "type": "boolean", - "description": "Skip validation of mass spectrometry files.", - "fa_icon": "far fa-check-square", - "help_text": "Skip validation of mass spectrometry metadata, including PTMs, tolerances or enzymes. Only useful if your metadata is correct but the terms are not in ontologies." - }, - "skip_factor_validation": { - "type": "boolean", - "description": "Skip validation of factor columns.", - "fa_icon": "far fa-check-square", - "help_text": "Skip validation of factor columns in the SDRF. Only useful if your factor values are correct but the sdrf-validation library does not recognize them.", - "default": true - }, - "skip_experimental_design_validation": { - "type": "boolean", - "description": "Skip validation of experimental design.", - "fa_icon": "far fa-check-square", - "help_text": "Skip validation of experimental design in the SDRF. 
Only useful if your experimental design is correct but the sdrf-validation library does not recognize it." - }, "use_ols_cache_only": { "type": "boolean", "description": "Use cached version of the Ontology Lookup Service (OLS).", @@ -302,6 +276,21 @@ "description": "Settings for DIA-NN - a universal software for data-independent acquisition (DIA) proteomics data processing.", "default": "", "properties": { + "diann_version": { + "type": "string", + "description": "Specify the DIA-NN version to be used in the workflow.", + "fa_icon": "fas fa-tag" + }, + "enable_mod_localization": { + "type": "boolean", + "description": "Enable or disable modification localization scoring in DIA-NN.", + "fa_icon": "fas fa-map-marker-alt" + }, + "mod_localization": { + "type": "string", + "description": "Specify the modification localization parameters for DIA-NN.", + "fa_icon": "fas fa-cogs" + }, "mass_acc_automatic": { "type": "boolean", "default": true, @@ -321,6 +310,20 @@ "help_text": " Ideally, should be approximately equal to the average number of data points per peak", "default": 8 }, + "mass_acc_ms2": { + "type": "number", + "description": "Set the MS2 mass accuracy (tolerance) to a specific value in ppm.", + "fa_icon": "fas fa-bullseye", + "help_text": "If specified, this overrides the automatic calibration. Corresponds to the --mass-acc parameter in DIA-NN.", + "default": 15 + }, + "mass_acc_ms1": { + "type": "number", + "description": "Set the MS1 mass accuracy (tolerance) to a specific value in ppm.", + "fa_icon": "fas fa-bullseye", + "help_text": "If specified, this overrides the automatic calibration. 
Corresponds to the --mass-acc-ms1 parameter in DIA-NN.", + "default": 15 + }, "performance_mode": { "type": "boolean", "description": "Set Low RAM & High Speed Mode for DIANN, including min-corr, corr-diff, and time-corr-only three parameters", diff --git a/subworkflows/local/create_input_channel/main.nf b/subworkflows/local/create_input_channel/main.nf index 4f5503a..5465021 100644 --- a/subworkflows/local/create_input_channel/main.nf +++ b/subworkflows/local/create_input_channel/main.nf @@ -12,11 +12,11 @@ workflow CREATE_INPUT_CHANNEL { main: ch_versions = channel.empty() - // Always parse as SDRF (OpenMS experimental design format deprecated) + // Always parse as SDRF using DIA-NN converter SDRF_PARSING(ch_sdrf) ch_versions = ch_versions.mix(SDRF_PARSING.out.versions) - ch_config = SDRF_PARSING.out.ch_sdrf_config_file ch_expdesign = SDRF_PARSING.out.ch_expdesign + ch_diann_cfg = SDRF_PARSING.out.ch_diann_cfg def Set enzymes = [] def Set files = [] @@ -26,7 +26,7 @@ workflow CREATE_INPUT_CHANNEL { experiment_id: file(ch_sdrf.toString()).baseName, ] - ch_config + ch_expdesign .splitCsv(header: true, sep: '\t') .map { row -> create_meta_channel(row, enzymes, files, wrapper) } .set { ch_meta_config_dia } @@ -34,6 +34,7 @@ workflow CREATE_INPUT_CHANNEL { emit: ch_meta_config_dia // [meta, spectra_file] ch_expdesign + ch_diann_cfg versions = ch_versions } @@ -44,7 +45,7 @@ def create_meta_channel(LinkedHashMap row, enzymes, files, wrapper) { // Always use SDRF format if (!params.root_folder) { - filestr = row.URI.toString() + filestr = row.URI?.toString()?.trim() ? row.URI.toString() : row.Filename.toString() } else { filestr = row.Filename.toString() @@ -67,30 +68,22 @@ def create_meta_channel(LinkedHashMap row, enzymes, files, wrapper) { } // Validate acquisition method is DIA - if (row["Proteomics Data Acquisition Method"].toString().toLowerCase().contains("data-independent acquisition")) { + // AcquisitionMethod is already extracted by convert-diann (e.g. 
"Data-Independent Acquisition") + def acqMethod = row.AcquisitionMethod?.toString()?.trim() ?: "" + if (acqMethod.toLowerCase().contains("data-independent acquisition") || acqMethod.toLowerCase().contains("dia")) { + meta.acquisition_method = "dia" + } + else if (acqMethod.isEmpty()) { + // If no acquisition method column in SDRF, assume DIA (this is a DIA-only pipeline) meta.acquisition_method = "dia" } else { - log.error("This pipeline only supports Data-Independent Acquisition (DIA). Found: '${row["Proteomics Data Acquisition Method"]}'. Use the quantms pipeline for DDA workflows.") + log.error("This pipeline only supports Data-Independent Acquisition (DIA). Found: '${acqMethod}'. Use the quantms pipeline for DDA workflows.") exit(1) } - // dissociation method conversion - if (row.DissociationMethod == "COLLISION-INDUCED DISSOCIATION") { - meta.dissociationmethod = "CID" - } - else if (row.DissociationMethod == "HIGHER ENERGY BEAM-TYPE COLLISION-INDUCED DISSOCIATION") { - meta.dissociationmethod = "HCD" - } - else if (row.DissociationMethod == "ELECTRON TRANSFER DISSOCIATION") { - meta.dissociationmethod = "ETD" - } - else if (row.DissociationMethod == "ELECTRON CAPTURE DISSOCIATION") { - meta.dissociationmethod = "ECD" - } - else { - meta.dissociationmethod = row.DissociationMethod - } + // DissociationMethod is already normalized by convert-diann (HCD, CID, ETD, ECD) + meta.dissociationmethod = row.DissociationMethod?.toString()?.trim() ?: "" wrapper.acquisition_method = meta.acquisition_method @@ -131,6 +124,7 @@ def create_meta_channel(LinkedHashMap row, enzymes, files, wrapper) { exit(1) } } else { + log.warn("No precursor mass tolerance in SDRF for '${filestr}'. 
Using default: ${params.precursor_mass_tolerance} ${params.precursor_mass_tolerance_unit}") meta.precursormasstolerance = params.precursor_mass_tolerance } @@ -154,6 +148,7 @@ def create_meta_channel(LinkedHashMap row, enzymes, files, wrapper) { exit(1) } } else { + log.warn("No fragment mass tolerance in SDRF for '${filestr}'. Using default: ${params.fragment_mass_tolerance} ${params.fragment_mass_tolerance_unit}") meta.fragmentmasstolerance = params.fragment_mass_tolerance } @@ -175,6 +170,12 @@ def create_meta_channel(LinkedHashMap row, enzymes, files, wrapper) { meta.variablemodifications = params.variable_mods } + // Per-file scan ranges (empty string = no flags passed, DIA-NN auto-detects) + meta.ms1minmz = row.MS1MinMz?.toString()?.trim() ?: "" + meta.ms1maxmz = row.MS1MaxMz?.toString()?.trim() ?: "" + meta.ms2minmz = row.MS2MinMz?.toString()?.trim() ?: "" + meta.ms2maxmz = row.MS2MaxMz?.toString()?.trim() ?: "" + enzymes += row.Enzyme if (enzymes.size() > 1) { log.error("Currently only one enzyme is supported for the whole experiment. Specified was '${enzymes}'. 
Check or split your SDRF.") diff --git a/subworkflows/local/parse_empirical_log/main.nf b/subworkflows/local/parse_empirical_log/main.nf new file mode 100644 index 0000000..0d1322d --- /dev/null +++ b/subworkflows/local/parse_empirical_log/main.nf @@ -0,0 +1,16 @@ + +include { PARSE_EMPIRICAL_LOG_TASK } from '../../../modules/local/parse_empirical_log_task' + +workflow PARSE_EMPIRICAL_LOG { + take: + ch_log_file + + main: + PARSE_EMPIRICAL_LOG_TASK(ch_log_file) + + ch_parsed_vals = PARSE_EMPIRICAL_LOG_TASK.out.parsed_vals + .ifEmpty("${params.mass_acc_ms2},${params.mass_acc_ms1},${params.scan_window}") + + emit: + parsed_vals = ch_parsed_vals +} diff --git a/subworkflows/local/parse_empirical_log/meta.yml b/subworkflows/local/parse_empirical_log/meta.yml new file mode 100644 index 0000000..f6baabb --- /dev/null +++ b/subworkflows/local/parse_empirical_log/meta.yml @@ -0,0 +1,25 @@ +name: "parse_empirical_log" +description: "Subworkflow for parsing the empirical assembly log file (from DIA-NN) to extract calibrated parameters." +keywords: + - parse + - log + - diann + - proteomics + - parameters + - mass_accuracy +components: + - parse_empirical_log_task +input: + - ch_log_file: + type: file + description: | + The log file generated by the empirical library assembly step. Can be an empty channel if the user did not provide a log file. +output: + - parsed_vals: + type: string + description: | + A value channel containing a comma-separated string of the extracted parameters (mass_acc_ms2, mass_acc_ms1, scan_window). Falls back to default pipeline parameters if the log is empty or invalid. 
+authors: + - "@bigbio" +maintainers: + - "@bigbio" diff --git a/workflows/dia.nf b/workflows/dia.nf index 712f6db..67e6e1e 100644 --- a/workflows/dia.nf +++ b/workflows/dia.nf @@ -7,9 +7,9 @@ // // MODULES: Local to the pipeline // -include { GENERATE_CFG } from '../modules/local/diann/generate_cfg/main' -include { DIANN_MSSTATS } from '../modules/local/diann/diann_msstats/main' +include { DIANN_MSSTATS } from '../modules/local/diann/diann_msstats/main' include { PRELIMINARY_ANALYSIS } from '../modules/local/diann/preliminary_analysis/main' +include { PARSE_EMPIRICAL_LOG } from '../subworkflows/local/parse_empirical_log/main' include { ASSEMBLE_EMPIRICAL_LIBRARY } from '../modules/local/diann/assemble_empirical_library/main' include { INSILICO_LIBRARY_GENERATION } from '../modules/local/diann/insilico_library_generation/main' include { INDIVIDUAL_ANALYSIS } from '../modules/local/diann/individual_analysis/main' @@ -30,6 +30,7 @@ workflow DIA { take: ch_file_preparation_results ch_expdesign + ch_diann_cfg main: @@ -44,12 +45,9 @@ workflow DIA { meta = ch_result.meta.unique { m -> m.experiment_id } - GENERATE_CFG(meta) - ch_software_versions = ch_software_versions - .mix(GENERATE_CFG.out.versions) - + // diann_config.cfg comes directly from SDRF_PARSING (convert-diann) // Convert to value channel so it can be consumed by all per-file processes - ch_diann_cfg = GENERATE_CFG.out.diann_cfg.first() + ch_diann_cfg_val = ch_diann_cfg.first() // // MODULE: SILICOLIBRARYGENERATION @@ -57,17 +55,32 @@ workflow DIA { if (params.diann_speclib != null && params.diann_speclib.toString() != "") { speclib = channel.from(file(params.diann_speclib, checkIfExists: true)) } else { - INSILICO_LIBRARY_GENERATION(ch_searchdb, ch_diann_cfg) + INSILICO_LIBRARY_GENERATION(ch_searchdb, ch_diann_cfg_val) speclib = INSILICO_LIBRARY_GENERATION.out.predict_speclib } if (params.skip_preliminary_analysis) { - assembly_log = channel.fromPath(params.empirical_assembly_log) - empirical_library = 
channel.fromPath(params.diann_speclib) - indiv_fin_analysis_in = ch_file_preparation_results.combine(ch_searchdb) - .combine(assembly_log) - .combine(empirical_library) - empirical_lib = empirical_library + if (params.empirical_assembly_log) { + ch_empirical_log = Channel.fromPath(params.empirical_assembly_log, checkIfExists: true) + } else { + ch_empirical_log = Channel.empty() + } + PARSE_EMPIRICAL_LOG(ch_empirical_log) + ch_parsed_vals = PARSE_EMPIRICAL_LOG.out.parsed_vals + indiv_fin_analysis_in = ch_file_preparation_results + .combine(ch_searchdb) + .combine(speclib) + .combine(ch_parsed_vals) + .map { meta_map, ms_file, fasta, library, param_string -> + def values = param_string.split(',') + def new_meta = meta_map + [ + mass_acc_ms2 : values[0], + mass_acc_ms1 : values[1], + scan_window : values[2] + ] + return [ new_meta, ms_file, fasta, library ] + } + empirical_lib = speclib } else { // // MODULE: PRELIMINARY_ANALYSIS @@ -80,12 +93,12 @@ workflow DIA { empirical_lib_files = preanalysis_subset .map { result -> result[1] } .collect( sort: { a, b -> file(a).getName() <=> file(b).getName() } ) - PRELIMINARY_ANALYSIS(preanalysis_subset.combine(speclib), ch_diann_cfg) + PRELIMINARY_ANALYSIS(preanalysis_subset.combine(speclib), ch_diann_cfg_val) } else { empirical_lib_files = ch_file_preparation_results .map { result -> result[1] } .collect( sort: { a, b -> file(a).getName() <=> file(b).getName() } ) - PRELIMINARY_ANALYSIS(ch_file_preparation_results.combine(speclib), ch_diann_cfg) + PRELIMINARY_ANALYSIS(ch_file_preparation_results.combine(speclib), ch_diann_cfg_val) } ch_software_versions = ch_software_versions .mix(PRELIMINARY_ANALYSIS.out.versions) @@ -99,22 +112,32 @@ workflow DIA { meta, PRELIMINARY_ANALYSIS.out.diann_quant.collect(), speclib, - ch_diann_cfg + ch_diann_cfg_val ) ch_software_versions = ch_software_versions .mix(ASSEMBLE_EMPIRICAL_LIBRARY.out.versions) + PARSE_EMPIRICAL_LOG(ASSEMBLE_EMPIRICAL_LIBRARY.out.log) + ch_parsed_vals = 
PARSE_EMPIRICAL_LOG.out.parsed_vals indiv_fin_analysis_in = ch_file_preparation_results .combine(ch_searchdb) - .combine(ASSEMBLE_EMPIRICAL_LIBRARY.out.log) .combine(ASSEMBLE_EMPIRICAL_LIBRARY.out.empirical_library) - + .combine(ch_parsed_vals) + .map { meta_map, ms_file, fasta, library, param_string -> + def values = param_string.trim().split(',') + def new_meta = meta_map + [ + mass_acc_ms2 : values[0], + mass_acc_ms1 : values[1], + scan_window : values[2] + ] + return [ new_meta, ms_file, fasta, library ] + } empirical_lib = ASSEMBLE_EMPIRICAL_LIBRARY.out.empirical_library } // // MODULE: INDIVIDUAL_ANALYSIS // - INDIVIDUAL_ANALYSIS(indiv_fin_analysis_in, ch_diann_cfg) + INDIVIDUAL_ANALYSIS(indiv_fin_analysis_in, ch_diann_cfg_val) ch_software_versions = ch_software_versions .mix(INDIVIDUAL_ANALYSIS.out.versions) @@ -137,7 +160,7 @@ workflow DIA { empirical_lib, INDIVIDUAL_ANALYSIS.out.diann_quant.collect(), ch_searchdb, - ch_diann_cfg) + ch_diann_cfg_val) ch_software_versions = ch_software_versions.mix( FINAL_QUANTIFICATION.out.versions @@ -162,6 +185,7 @@ workflow DIA { emit: versions = ch_software_versions diann_report = diann_main_report + diann_log = FINAL_QUANTIFICATION.out.log msstats_in = DIANN_MSSTATS.out.out_msstats } @@ -179,6 +203,10 @@ def preprocessed_meta(LinkedHashMap meta) { parameters['fragmentmasstolerance'] = meta.fragmentmasstolerance parameters['fragmentmasstoleranceunit'] = meta.fragmentmasstoleranceunit parameters['enzyme'] = meta.enzyme + parameters['ms1minmz'] = meta.ms1minmz + parameters['ms1maxmz'] = meta.ms1maxmz + parameters['ms2minmz'] = meta.ms2minmz + parameters['ms2maxmz'] = meta.ms2maxmz return parameters } diff --git a/workflows/quantmsdiann.nf b/workflows/quantmsdiann.nf index a5e4f4c..9d869ac 100644 --- a/workflows/quantmsdiann.nf +++ b/workflows/quantmsdiann.nf @@ -86,6 +86,7 @@ workflow QUANTMSDIANN { DIA( ch_fileprep_result.dia, CREATE_INPUT_CHANNEL.out.ch_expdesign, + CREATE_INPUT_CHANNEL.out.ch_diann_cfg, ) 
ch_pipeline_results = ch_pipeline_results.mix(DIA.out.diann_report) ch_msstats_in = ch_msstats_in.mix(DIA.out.msstats_in) @@ -115,6 +116,7 @@ workflow QUANTMSDIANN { ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_config) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(FILE_PREPARATION.out.statistics) + ch_multiqc_files = ch_multiqc_files.mix(DIA.out.diann_log) ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml', sort: false)) ch_multiqc_quantms_logo = file("${projectDir}/assets/nf-core-quantmsdiann_logo_light.png")