diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 39597f0..e80a477 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,7 +27,7 @@ jobs: TEST_PROFILE: ${{ matrix.test_profile }} EXEC_PROFILE: ${{ matrix.exec_profile }} - name: "CI [${{ matrix.test_profile }}] DIA-NN=1.8.1 NXF=${{ matrix.NXF_VER }}" + name: "CI [${{ matrix.test_profile }}] NXF=${{ matrix.NXF_VER }}" if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'bigbio/quantmsdiann') }} runs-on: ubuntu-latest strategy: @@ -35,7 +35,7 @@ jobs: matrix: NXF_VER: - "25.04.0" - test_profile: ["test_dia", "test_dia_dotd"] + test_profile: ["test_dia", "test_dia_dotd", "test_dda"] exec_profile: ["docker"] # extended ci tests singularity. steps: @@ -60,11 +60,24 @@ jobs: mkdir -p $NXF_SINGULARITY_CACHEDIR mkdir -p $NXF_SINGULARITY_LIBRARYDIR + - name: Log in to GitHub Container Registry + if: matrix.test_profile == 'test_dda' + env: + GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }} + GHCR_USERNAME: ${{ secrets.GHCR_USERNAME }} + run: | + if [ -z "$GHCR_TOKEN" ] || [ -z "$GHCR_USERNAME" ]; then + echo "::warning::Skipping test_dda: GHCR credentials not available (expected for fork PRs)" + echo "SKIP_DDA=true" >> $GITHUB_ENV + exit 0 + fi + echo "$GHCR_TOKEN" | docker login ghcr.io -u "$GHCR_USERNAME" --password-stdin + - name: Disk space cleanup uses: jlumbroso/free-disk-space@v1.3.1 - name: Run pipeline with test data in docker/singularity profile - if: github.event.pull_request.base.ref != 'master' + if: github.event.pull_request.base.ref != 'master' && env.SKIP_DDA != 'true' run: | nextflow run ${GITHUB_WORKSPACE} -profile $TEST_PROFILE,$EXEC_PROFILE,dev --outdir ${TEST_PROFILE}_${EXEC_PROFILE}_results diff --git a/.github/workflows/extended_ci.yml b/.github/workflows/extended_ci.yml index 42a9f48..621ab2c 100644 --- a/.github/workflows/extended_ci.yml +++ b/.github/workflows/extended_ci.yml @@ -114,7 +114,8 @@ jobs: strategy: fail-fast: false 
matrix: - test_profile: ["test_latest_dia", "test_dia_quantums", "test_dia_parquet"] + test_profile: + ["test_latest_dia", "test_dia_quantums", "test_dia_parquet", "test_dda", "test_dia_skip_preanalysis"] env: NXF_ANSI_LOG: false CAPSULE_LOG: none diff --git a/.gitignore b/.gitignore index 52812e6..6a54d55 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ null/ .codacy/ .github/instructions/codacy.instructions.md docs/superpowers/ +docs/plans/ diff --git a/AGENTS.md b/AGENTS.md index 2c32d31..982fcfe 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -80,7 +80,7 @@ quantmsdiann/ │ ├── pmultiqc/ # QC reporting │ ├── sdrf_parsing/ # SDRF parsing │ ├── samplesheet_check/ # Input validation -│ └── utils/ # tdf2mzml, decompress, mzml stats +│ └── utils/ # decompress, mzml stats ├── conf/ │ ├── base.config # Resource definitions │ ├── modules/ # Module-specific configs @@ -97,7 +97,7 @@ quantmsdiann/ The pipeline executes the following steps: 1. **SDRF Validation & Parsing** - Validates input SDRF and extracts metadata -2. **File Preparation** - Converts RAW/mzML/.d/.dia files (ThermoRawFileParser, tdf2mzml) +2. **File Preparation** - Converts RAW/mzML/.d/.dia files (ThermoRawFileParser) 3. **Generate Config** - Creates DIA-NN config from enzyme/modifications (`quantmsutilsc dianncfg`) 4. **In-Silico Library Generation** - Predicts spectral library from FASTA (or uses provided library) 5. **Preliminary Analysis** - Per-file calibration and mass accuracy determination diff --git a/CITATIONS.md b/CITATIONS.md index d74e0f9..8985a13 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -1,6 +1,6 @@ # bigbio/quantmsdiann: Citations -## [Pipeline](https://www.researchsquare.com/article/rs-3002027/v1) +## [Pipeline](https://doi.org/10.1038/s41592-024-02343-1) > Dai C, Pfeuffer J, Wang H, Zheng P, Käll L, Sachsenberg T, Demichev V, Bai M, Kohlbacher O, Perez-Riverol Y. quantms: a cloud-based pipeline for quantitative proteomics enables the reanalysis of public proteomics data. 
Nat Methods. 2024 Jul 4. doi: 10.1038/s41592-024-02343-1. Epub ahead of print. PMID: 38965444. diff --git a/README.md b/README.md index 67c2de6..ee88b0e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![GitHub Actions CI Status](https://github.com/bigbio/quantmsdiann/actions/workflows/ci.yml/badge.svg)](https://github.com/bigbio/quantmsdiann/actions/workflows/ci.yml) [![GitHub Actions Linting Status](https://github.com/bigbio/quantmsdiann/actions/workflows/linting.yml/badge.svg)](https://github.com/bigbio/quantmsdiann/actions/workflows/linting.yml) -[![Cite with Zenodo](https://zenodo.org/badge/DOI/10.5281/zenodo.15573386.svg)](https://doi.org/10.5281/zenodo.15573386) +[![Cite with Zenodo](https://zenodo.org/badge/DOI/10.5281/zenodo.19437128.svg)](https://doi.org/10.5281/zenodo.19437128) [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) [![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) @@ -25,7 +25,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool The pipeline takes [SDRF](https://github.com/bigbio/proteomics-metadata-standard) metadata and mass spectrometry data files (`.raw`, `.mzML`, `.d`, `.dia`) as input and performs: 1. **Input validation** — SDRF parsing and validation via [sdrf-pipelines](https://github.com/bigbio/sdrf-pipelines) -2. **File preparation** — RAW to mzML conversion ([ThermoRawFileParser](https://github.com/compomics/ThermoRawFileParser)), indexing, Bruker `.d` handling ([tdf2mzml](https://github.com/bigbio/tdf2mzml)) +2. **File preparation** — RAW to mzML conversion ([ThermoRawFileParser](https://github.com/compomics/ThermoRawFileParser)), indexing 3. **In-silico spectral library generation** — deep learning-based prediction, or use a user-provided library (`--diann_speclib`) 4. 
**Preliminary analysis** — per-file calibration and mass accuracy estimation (parallelized) 5. **Empirical library assembly** — consensus library from preliminary results with RT profiling @@ -103,7 +103,7 @@ If you would like to contribute to this pipeline, please see the [contributing g If you use quantmsdiann in your research, please cite: -> Dai et al. "quantms: a cloud-based pipeline for quantitative proteomics" (2024). DOI: [10.5281/zenodo.15573386](https://doi.org/10.5281/zenodo.15573386) +> Dai et al. "quantms: a cloud-based pipeline for quantitative proteomics" (2024). DOI: [10.5281/zenodo.19437128](https://doi.org/10.5281/zenodo.19437128) An extensive list of references for the tools used by the pipeline can be found in the [CITATIONS.md](CITATIONS.md) file. diff --git a/assets/schema_input.json b/assets/schema_input.json index 7b15010..1699aad 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -11,7 +11,7 @@ "source name": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces" + "errorMessage": "Source name must be provided and cannot contain spaces" }, "comment[data file]": { "type": "string", @@ -22,7 +22,7 @@ "assay name": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Assay name must be provided and cannot contain spaces", + "errorMessage": "Assay name must be provided and cannot contain whitespace", "meta": ["assay"] } } diff --git a/conf/diann_versions/v2_3_2.config b/conf/diann_versions/v2_3_2.config new file mode 100644 index 0000000..2912f15 --- /dev/null +++ b/conf/diann_versions/v2_3_2.config @@ -0,0 +1,14 @@ +/* + * DIA-NN 2.3.2 container override (private ghcr.io) + * Latest release with DDA support and InfinDIA. 
+ */ +params.diann_version = '2.3.2' + +process { + withLabel: diann { + container = 'ghcr.io/bigbio/diann:2.3.2' + } +} + +singularity.enabled = false +docker.enabled = true diff --git a/conf/tests/test_dda.config b/conf/tests/test_dda.config new file mode 100644 index 0000000..f7870c4 --- /dev/null +++ b/conf/tests/test_dda.config @@ -0,0 +1,53 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for testing DDA analysis (requires DIA-NN >= 2.3.2) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Tests DDA mode using the PXD022287 HeLa dataset with --diann_dda flag. + Uses ghcr.io/bigbio/diann:2.3.2. + + Use as follows: + nextflow run bigbio/quantmsdiann -profile test_dda,docker [--outdir <OUTDIR>] + +------------------------------------------------------------------------------------------------ +*/ + +process { + resourceLimits = [ + cpus: 4, + memory: '12.GB', + time: '48.h' + ] +} + +params { + config_profile_name = 'Test profile for DDA analysis' + config_profile_description = 'DDA test using PXD022287 HeLa dataset with DIA-NN 2.3.2.'
+ + outdir = './results_dda' + + // Input data - PXD022287 HeLa DDA dataset + input = 'https://raw.githubusercontent.com/bigbio/quantms-test-datasets/quantms/testdata/diann_dda_ci/PXD022287.sdrf.tsv' + database = 'https://raw.githubusercontent.com/bigbio/quantms-test-datasets/quantms/testdata/diann_dda_ci/PXD022287_subset_human.fasta' + + // DDA mode + diann_dda = true + diann_version = '2.3.2' + + // Search parameters matching PXD022287 HeLa dataset + min_peptide_length = 7 + max_peptide_length = 30 + max_precursor_charge = 3 + allowed_missed_cleavages = 1 + diann_normalize = false + publish_dir_mode = 'symlink' + max_mods = 2 +} + +process { + withLabel: diann { + container = 'ghcr.io/bigbio/diann:2.3.2' + } +} + +singularity.enabled = false +docker.enabled = true diff --git a/conf/tests/test_dia_skip_preanalysis.config b/conf/tests/test_dia_skip_preanalysis.config new file mode 100644 index 0000000..e1968b5 --- /dev/null +++ b/conf/tests/test_dia_skip_preanalysis.config @@ -0,0 +1,48 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for testing skip_preliminary_analysis path +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Tests the pipeline with skip_preliminary_analysis=true, using default + mass accuracy parameters. Validates the untested code path in dia.nf. + + Use as follows: + nextflow run bigbio/quantmsdiann -profile test_dia_skip_preanalysis,docker [--outdir <OUTDIR>] + +------------------------------------------------------------------------------------------------ +*/ + +process { + resourceLimits = [ + cpus: 4, + memory: '12.GB', + time: '48.h' + ] +} + +params { + config_profile_name = 'Test profile for skip preliminary analysis' + config_profile_description = 'Tests skip_preliminary_analysis path with default mass accuracy params.'
+ + outdir = './results_skip_preanalysis' + + // Input data - same as test_dia + input = 'https://raw.githubusercontent.com/bigbio/quantms-test-datasets/quantms/testdata/dia_ci/PXD026600.sdrf.tsv' + database = 'https://raw.githubusercontent.com/bigbio/quantms-test-datasets/quantms/testdata/dia_ci/REF_EColi_K12_UPS1_combined.fasta' + min_pr_mz = 350 + max_pr_mz = 950 + min_fr_mz = 500 + max_fr_mz = 1500 + min_peptide_length = 15 + max_peptide_length = 30 + max_precursor_charge = 3 + allowed_missed_cleavages = 1 + diann_normalize = false + publish_dir_mode = 'symlink' + max_mods = 2 + + // Skip preliminary analysis - use default mass accuracy params + skip_preliminary_analysis = true + mass_acc_ms2 = 15 + mass_acc_ms1 = 15 + scan_window = 8 +} diff --git a/docs/parameters.md b/docs/parameters.md index 0f25c4d..f3c76c0 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -27,6 +27,7 @@ This document lists every pipeline parameter organised by category. Default valu | `--reindex_mzml` | boolean | `true` | Force re-indexing of input mzML files at the start of the pipeline for safety. | | `--mzml_statistics` | boolean | `false` | Compute MS1/MS2 statistics from mzML files. Generates `*_ms_info.parquet` files for QC. Bruker `.d` files are always skipped. | | `--mzml_features` | boolean | `false` | Compute MS1-level features during the mzML statistics step. Only available for mzML files. | +| `--convert_dotd` | boolean | `false` | Convert Bruker .d files to mzML format before processing. | ## 4. Search Parameters @@ -51,12 +52,16 @@ This document lists every pipeline parameter organised by category. Default valu ## 5. 
DIA-NN General -| Parameter | Type | Default | Description | -| -------------------- | ------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `--diann_version` | string | `1.8.1` | DIA-NN version used by the workflow. Controls version-dependent flags (e.g. `--monitor-mod` for 1.8.x). See [DIA-NN Version Selection](usage.md#dia-nn-version-selection). | -| `--diann_debug` | integer | `3` | DIA-NN debug/verbosity level (0-4). Higher values produce more verbose logs. | -| `--diann_speclib` | string | `null` | Path to an external spectral library. If provided, the in-silico library generation step is skipped. | -| `--diann_extra_args` | string | `null` | Extra arguments appended to all DIA-NN steps. Flags incompatible with a step are automatically stripped with a warning. See [Passing Extra Arguments to DIA-NN](usage.md#passing-extra-arguments-to-dia-nn). | +| Parameter | Type | Default | Description | +| ------------------------ | ------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--diann_version` | string | `1.8.1` | DIA-NN version used by the workflow. Controls version-dependent flags (e.g. `--monitor-mod` for 1.8.x). See [DIA-NN Version Selection](usage.md#dia-nn-version-selection). | +| `--diann_debug` | integer | `3` | DIA-NN debug/verbosity level (0-4). Higher values produce more verbose logs. | +| `--diann_speclib` | string | `null` | Path to an external spectral library. If provided, the in-silico library generation step is skipped. | +| `--diann_extra_args` | string | `null` | Extra arguments appended to all DIA-NN steps. Flags incompatible with a step are automatically stripped with a warning. 
See [Passing Extra Arguments to DIA-NN](usage.md#passing-extra-arguments-to-dia-nn). | +| `--diann_dda` | boolean | `false` | Explicitly enable DDA mode. Normally auto-detected from the SDRF `comment[proteomics data acquisition method]` column. Use this flag only when the SDRF lacks the acquisition method. Requires DIA-NN >= 2.3.2. | +| `--diann_light_models` | boolean | `false` | Enable `--light-models` for 10x faster in-silico library generation. Requires DIA-NN >= 2.0. | +| `--diann_export_quant` | boolean | `false` | Enable `--export-quant` for fragment-level parquet data export. Requires DIA-NN >= 2.0. | +| `--diann_site_ms1_quant` | boolean | `false` | Enable `--site-ms1-quant` to use MS1 apex intensities for PTM site quantification. Requires DIA-NN >= 2.0. | ## 6. Mass Accuracy & Calibration @@ -92,12 +97,13 @@ This document lists every pipeline parameter organised by category. Default valu ## 10. Preliminary Analysis -| Parameter | Type | Default | Description | -| ----------------------------- | ------- | ------- | ------------------------------------------------------------------------------------------------------------------- | -| `--skip_preliminary_analysis` | boolean | `false` | Skip preliminary analysis. Use the provided spectral library as-is instead of generating a local consensus library. | -| `--random_preanalysis` | boolean | `false` | Enable random selection of spectrum files for empirical library generation. | -| `--random_preanalysis_seed` | integer | `42` | Random seed for file selection when `--random_preanalysis` is enabled. | -| `--empirical_assembly_ms_n` | integer | `200` | Number of randomly selected spectrum files when `--random_preanalysis` is enabled. 
| +| Parameter | Type | Default | Description | +| ----------------------------- | ------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------ | +| `--skip_preliminary_analysis` | boolean | `false` | Skip preliminary analysis. Use the provided spectral library as-is instead of generating a local consensus library. | +| `--empirical_assembly_log` | string | `null` | Path to a pre-existing empirical assembly log file. Only used when `--skip_preliminary_analysis true` and `--diann_speclib` are set. | +| `--random_preanalysis` | boolean | `false` | Enable random selection of spectrum files for empirical library generation. | +| `--random_preanalysis_seed` | integer | `42` | Random seed for file selection when `--random_preanalysis` is enabled. | +| `--empirical_assembly_ms_n` | integer | `200` | Number of randomly selected spectrum files when `--random_preanalysis` is enabled. | ## 11. Quantification & Output @@ -115,7 +121,27 @@ This document lists every pipeline parameter organised by category. Default valu | `--quantums_sel_runs` | integer | `null` | Number of automatically selected runs for QuantUMS training. Must be >= 6. Maps to `--quant-sel-runs`. Requires DIA-NN >= 1.9.2. | | `--quantums_params` | string | `null` | Pre-calculated QuantUMS parameters. Maps to `--quant-params`. Requires DIA-NN >= 1.9.2. | -## 12. Quality Control +## 12. DDA Mode + +| Parameter | Type | Default | Description | +| ------------- | ------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `--diann_dda` | boolean | `false` | Explicitly enable DDA mode when SDRF lacks the acquisition method column. Normally DDA is auto-detected from the SDRF `comment[proteomics data acquisition method]`. 
Requires DIA-NN >= 2.3.2 (use `-profile diann_v2_3_2`). Beta feature. | + +> **Note:** DDA mode is auto-detected from the SDRF when the `comment[proteomics data acquisition method]` +> column contains `data-dependent acquisition`. The `--diann_dda` flag is only needed as a +> fallback when the SDRF does not include this column. DDA requires DIA-NN >= 2.3.2 +> (`-profile diann_v2_3_2`). + +## 13. InfinDIA (Experimental) + +| Parameter | Type | Default | Description | +| -------------------- | ------- | ------- | -------------------------------------------------------------------------------------- | +| `--enable_infin_dia` | boolean | `false` | Enable InfinDIA for ultra-large search spaces. Requires DIA-NN >= 2.3.0. Experimental. | +| `--diann_pre_select` | integer | `null` | Precursor limit (`--pre-select N`) for InfinDIA pre-search. | + +> **Note:** InfinDIA requires DIA-NN >= 2.3.0 and is considered experimental. + +## 14. Quality Control | Parameter | Type | Default | Description | | ---------------------------- | ------- | ------- | ------------------------------------------------------------------------------------ | @@ -124,7 +150,7 @@ This document lists every pipeline parameter organised by category. Default valu | `--contaminant_string` | string | `CONT` | Contaminant affix string for pmultiqc. Maps to `--contaminant_affix` in pmultiqc. | | `--protein_level_fdr_cutoff` | number | `0.01` | Experiment-wide protein (group)-level FDR cutoff. | -## 13. MultiQC & Reporting +## 15. 
MultiQC & Reporting | Parameter | Type | Default | Description | | ------------------------------- | ------------------ | ------- | --------------------------------------------------------------------------------- | diff --git a/docs/usage.md b/docs/usage.md index b04cd7d..8b696df 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -64,6 +64,75 @@ For Synchro-PASEF data, enable `--diann_tims_sum` (which adds `--quant-tims-sum` > [!NOTE] > The pipeline will emit a warning during PRELIMINARY_ANALYSIS if it detects `.d` files with automatic mass accuracy calibration enabled, recommending to set tolerances via SDRF or pipeline parameters. +### DDA Analysis Mode (Beta) + +DIA-NN 2.3.2+ supports DDA data analysis via the `--dda` flag. The pipeline **auto-detects DDA mode** from the SDRF `comment[proteomics data acquisition method]` column — no extra flags needed if your SDRF contains `data-dependent acquisition`: + +```bash +nextflow run bigbio/quantmsdiann \ + --input dda_sdrf.tsv \ + --database proteins.fasta \ + -profile diann_v2_3_2,docker +``` + +If your SDRF does not include the acquisition method column, you can explicitly enable DDA mode with `--diann_dda true`: + +```bash +nextflow run bigbio/quantmsdiann \ + --input sdrf.tsv \ + --database proteins.fasta \ + --diann_dda true \ + -profile diann_v2_3_2,docker +``` + +**Limitations (beta feature):** + +- Only trust: q-values, PEP values, RT/IM values, Ms1.Apex.Area, Normalisation.Factor +- PTM localization probabilities are **unreliable** with DDA data +- MBR requires MS2-level evidence (DIA-like, not classical DDA MBR) +- No isobaric labeling or reporter-tag quantification +- Primary use cases: legacy DDA reanalysis, spectral library creation, immunopeptidomics + +The pipeline uses the same workflow for DDA as DIA — the `--dda` flag is passed to all DIA-NN steps automatically when DDA is detected from the SDRF or enabled via `--diann_dda`. 
+ +### Preprocessing Options + +- `--reindex_mzml` (default: true) — Re-index mzML files before processing. Disable with `--reindex_mzml false` if files are already indexed. +- `--mzml_statistics` (default: false) — Generate mzML statistics (parquet format) for QC. +- `--mzml_features` (default: false) — Enable feature detection in mzML statistics. +- `--convert_dotd` (default: false) — Convert Bruker .d files to mzML via tdf2mzml instead of passing natively to DIA-NN. + +### Passing Extra Arguments to DIA-NN + +Use `--diann_extra_args` to pass additional flags to all DIA-NN steps. The pipeline validates and strips flags it manages internally to prevent conflicts. + +Managed flags (stripped with a warning if passed via extra_args): `--lib`, `--f`, `--fasta`, `--threads`, `--verbose`, `--temp`, `--out`, `--matrices`, `--use-quant`, `--gen-spec-lib`, `--mass-acc`, `--mass-acc-ms1`, `--window`, `--var-mod`, `--fixed-mod`, `--monitor-mod`, and others. + +To enable this, add `includeConfig 'conf/modules/dia.config'` to your configuration (already included by default). + +### DIA-NN Version Selection + +The default DIA-NN version is 1.8.1. To use a different version: + +| Version | Profile | Features | +| ------- | ----------------------- | ----------------------------------- | +| 1.8.1 | (default) | Core DIA analysis | +| 2.1.0 | `-profile diann_v2_1_0` | Native .raw support, reduced memory | +| 2.2.0 | `-profile diann_v2_2_0` | Speed optimizations | +| 2.3.2 | `-profile diann_v2_3_2` | DDA support, InfinDIA | + +Example: `nextflow run bigbio/quantmsdiann -profile test_dia,docker,diann_v2_2_0` + +### Verbose Module Output + +Use `-profile verbose_modules` to publish intermediate files from all pipeline steps: + +```bash +nextflow run bigbio/quantmsdiann -profile test_dia,docker,verbose_modules --outdir results +``` + +This publishes ThermoRawFileParser conversions, mzML indexing results, per-file DIA-NN logs, and spectral library intermediates. 
+ ### Pipeline settings via params file Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `: diff --git a/lib/VersionUtils.groovy b/lib/VersionUtils.groovy new file mode 100644 index 0000000..f340a61 --- /dev/null +++ b/lib/VersionUtils.groovy @@ -0,0 +1,58 @@
+/**
+ * Semantic version comparison utility for DIA-NN version guards.
+ *
+ * Nextflow auto-loads all classes in lib/, so these are available
+ * in workflows and module scripts without explicit imports.
+ */
+class VersionUtils {
+
+    /**
+     * Compare two versions semantically (e.g. '2.10.0' > '2.3.2').
+     *
+     * Arguments are converted with toString(), so non-String values (for example
+     * a version that was parsed as a number) are accepted as well as Strings.
+     * Missing trailing segments count as 0, so '2.3' equals '2.3.0'.
+     *
+     * @param a first version
+     * @param b second version
+     * @return negative if a < b, zero if equal, positive if a > b
+     * @throws IllegalArgumentException if either version is null
+     */
+    static int compare(Object a, Object b) {
+        List<Integer> partsA = parse(a)
+        List<Integer> partsB = parse(b)
+        int maxLen = Math.max(partsA.size(), partsB.size())
+        for (int i = 0; i < maxLen; i++) {
+            int va = i < partsA.size() ? partsA[i] : 0
+            int vb = i < partsB.size() ? partsB[i] : 0
+            if (va != vb) return va <=> vb
+        }
+        return 0
+    }
+
+    /** True if version is strictly less than required. */
+    static boolean versionLessThan(Object version, Object required) {
+        return compare(version, required) < 0
+    }
+
+    /** True if version is greater than or equal to required. */
+    static boolean versionAtLeast(Object version, Object required) {
+        return compare(version, required) >= 0
+    }
+
+    /**
+     * Split a version into integer segments on '.'.
+     * A decorated segment contributes its first run of digits ('v2' -> 2,
+     * '2-beta' -> 2); a segment without usable digits counts as 0.
+     */
+    private static List<Integer> parse(Object version) {
+        if (version == null) {
+            throw new IllegalArgumentException('Version must not be null')
+        }
+        return version.toString().trim().tokenize('.').collect { String segment ->
+            if (segment.isInteger()) return segment.toInteger()
+            String digits = segment.find(/\d+/)
+            return (digits != null && digits.isInteger()) ? digits.toInteger() : 0
+        }
+    }
+}
diff --git a/modules/local/diann/assemble_empirical_library/main.nf b/modules/local/diann/assemble_empirical_library/main.nf index 2bc0525..2bfb67e 100644 --- a/modules/local/diann/assemble_empirical_library/main.nf +++ b/modules/local/diann/assemble_empirical_library/main.nf @@ -2,6 +2,7 @@ process ASSEMBLE_EMPIRICAL_LIBRARY { tag "$meta.experiment_id" label 'process_low' label 'diann' + label 'error_retry' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' : @@ -31,7 +32,7 @@ process ASSEMBLE_EMPIRICAL_LIBRARY { '--mass-acc', '--mass-acc-ms1', '--window', '--individual-mass-acc', '--individual-windows', '--out-lib', '--use-quant', '--gen-spec-lib', '--rt-profiling', - '--monitor-mod', '--var-mod', '--fixed-mod', + '--monitor-mod', '--var-mod', '--fixed-mod', '--dda', '--channels', '--lib-fixed-mod', '--original-mods'] // Sort by length descending so longer flags (e.g. --mass-acc-ms1) are matched before shorter prefixes (--mass-acc) blocked.sort { a -> -a.length() }.each { flag -> @@ -53,6 +54,7 @@ process ASSEMBLE_EMPIRICAL_LIBRARY { diann_no_peptidoforms = params.diann_no_peptidoforms ? "--no-peptidoforms" : "" diann_tims_sum = params.diann_tims_sum ? "--quant-tims-sum" : "" diann_im_window = params.diann_im_window ? "--im-window $params.diann_im_window" : "" + diann_dda_flag = meta.acquisition_method == 'dda' ? "--dda" : "" """ # Precursor Tolerance value was: ${meta['precursormasstolerance']} @@ -79,6 +81,7 @@ process ASSEMBLE_EMPIRICAL_LIBRARY { ${diann_no_peptidoforms} \\ ${diann_tims_sum} \\ ${diann_im_window} \\ + ${diann_dda_flag} \\ \${mod_flags} \\ $args diff --git a/modules/local/diann/diann_msstats/main.nf b/modules/local/diann/diann_msstats/main.nf index 4374b58..1844fdc 100644 --- a/modules/local/diann/diann_msstats/main.nf +++ b/modules/local/diann/diann_msstats/main.nf @@ -18,6 +18,7 @@ process DIANN_MSSTATS { script: def args = task.ext.args ?: '' """ + set -o pipefail quantmsutilsc diann2msstats \\ --report ${report} \\ --exp_design ${exp_design} \\ diff --git a/modules/local/diann/diann_msstats/meta.yml b/modules/local/diann/diann_msstats/meta.yml index ac1f147..a440d61 100644 --- a/modules/local/diann/diann_msstats/meta.yml +++ b/modules/local/diann/diann_msstats/meta.yml @@ -17,7 +17,7 @@ input: pattern: "*.tsv" - exp_design: type: file - description: An experimental design file including Sample and 
replicates column et al. + description: An experimental design file including Sample and replicates column etc. pattern: "*.tsv" - report_pr: type: file diff --git a/modules/local/diann/final_quantification/main.nf b/modules/local/diann/final_quantification/main.nf index cb7d481..f83af13 100644 --- a/modules/local/diann/final_quantification/main.nf +++ b/modules/local/diann/final_quantification/main.nf @@ -2,6 +2,7 @@ process FINAL_QUANTIFICATION { tag "$meta.experiment_id" label 'process_high' label 'diann' + label 'error_retry' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' : @@ -24,16 +25,17 @@ process FINAL_QUANTIFICATION { path "diann_report.{tsv,parquet}", emit: main_report, optional: true path "diann_report.manifest.txt", emit: report_manifest, optional: true path "diann_report.protein_description.tsv", emit: protein_description, optional: true - path "diann_report.stats.tsv", emit: report_stats - path "diann_report.pr_matrix.tsv", emit: pr_matrix - path "diann_report.pg_matrix.tsv", emit: pg_matrix - path "diann_report.gg_matrix.tsv", emit: gg_matrix - path "diann_report.unique_genes_matrix.tsv", emit: unique_gene_matrix + path "diann_report.stats.tsv", emit: report_stats, optional: true + path "diann_report.pr_matrix.tsv", emit: pr_matrix, optional: true + path "diann_report.pg_matrix.tsv", emit: pg_matrix, optional: true + path "diann_report.gg_matrix.tsv", emit: gg_matrix, optional: true + path "diann_report.unique_genes_matrix.tsv", emit: unique_gene_matrix, optional: true path "diannsummary.log", emit: log // Different library files format are exported due to different DIA-NN versions path "empirical_library.tsv", emit: final_speclib, optional: true path "empirical_library.tsv.skyline.speclib", emit: skyline_speclib, optional: true + path "*.site_report.parquet", emit: site_report, optional: true path 
"versions.yml", emit: versions when: @@ -47,7 +49,7 @@ process FINAL_QUANTIFICATION { '--use-quant', '--matrices', '--out', '--relaxed-prot-inf', '--pg-level', '--qvalue', '--window', '--individual-windows', '--species-genes', '--report-decoys', '--xic', '--no-norm', - '--monitor-mod', '--var-mod', '--fixed-mod', + '--monitor-mod', '--var-mod', '--fixed-mod', '--dda', '--export-quant', '--site-ms1-quant', '--channels', '--lib-fixed-mod', '--original-mods'] // Sort by length descending so longer flags (e.g. --individual-windows) are matched before shorter prefixes (--window) blocked.sort { a -> -a.length() }.each { flag -> @@ -63,13 +65,18 @@ process FINAL_QUANTIFICATION { no_norm = params.diann_normalize ? "" : "--no-norm" report_decoys = params.diann_report_decoys ? "--report-decoys": "" diann_export_xic = params.diann_export_xic ? "--xic": "" - // --direct-quant only exists in DIA-NN >= 1.9.2 (QuantUMS counterpart); skip for older versions - quantums = params.quantums ? "" : (params.diann_version >= '1.9' ? "--direct-quant" : "") + // --direct-quant exists in DIA-NN >= 1.9.2 (QuantUMS counterpart); skip for older versions + quantums = params.quantums ? "" : (VersionUtils.versionAtLeast(params.diann_version, '1.9.2') ? "--direct-quant" : "") quantums_train_runs = params.quantums_train_runs ? "--quant-train-runs $params.quantums_train_runs": "" quantums_sel_runs = params.quantums_sel_runs ? "--quant-sel-runs $params.quantums_sel_runs": "" quantums_params = params.quantums_params ? "--quant-params $params.quantums_params": "" diann_no_peptidoforms = params.diann_no_peptidoforms ? "--no-peptidoforms" : "" diann_use_quant = params.diann_use_quant ? "--use-quant" : "" + diann_dda_flag = meta.acquisition_method == 'dda' ? "--dda" : "" + diann_export_quant = params.diann_export_quant ? "--export-quant" : "" + diann_site_ms1_quant = params.diann_site_ms1_quant ? "--site-ms1-quant" : "" + diann_channel_run_norm = params.diann_channel_run_norm ? 
"--channel-run-norm" : "" + diann_channel_spec_norm = params.diann_channel_spec_norm ? "--channel-spec-norm" : "" """ # Notes: if .quant files are passed, mzml/.d files are not accessed, so the name needs to be passed but files @@ -99,6 +106,11 @@ process FINAL_QUANTIFICATION { ${quantums_params} \\ ${diann_no_peptidoforms} \\ ${diann_use_quant} \\ + ${diann_dda_flag} \\ + ${diann_export_quant} \\ + ${diann_site_ms1_quant} \\ + ${diann_channel_run_norm} \\ + ${diann_channel_spec_norm} \\ \${mod_flags} \\ $args diff --git a/modules/local/diann/generate_cfg/main.nf b/modules/local/diann/generate_cfg/main.nf index 8377030..8e36641 100644 --- a/modules/local/diann/generate_cfg/main.nf +++ b/modules/local/diann/generate_cfg/main.nf @@ -18,6 +18,7 @@ process GENERATE_CFG { def args = task.ext.args ?: '' """ + set -o pipefail quantmsutilsc dianncfg \\ --enzyme "${meta.enzyme}" \\ --fix_mod "${meta.fixedmodifications}" \\ diff --git a/modules/local/diann/individual_analysis/main.nf b/modules/local/diann/individual_analysis/main.nf index 3bdccae..975ceaf 100644 --- a/modules/local/diann/individual_analysis/main.nf +++ b/modules/local/diann/individual_analysis/main.nf @@ -2,6 +2,7 @@ process INDIVIDUAL_ANALYSIS { tag "$ms_file.baseName" label 'process_high' label 'diann' + label 'error_retry' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' : @@ -27,7 +28,7 @@ process INDIVIDUAL_ANALYSIS { '--mass-acc', '--mass-acc-ms1', '--window', '--no-ifs-removal', '--no-main-report', '--relaxed-prot-inf', '--pg-level', '--min-pr-mz', '--max-pr-mz', '--min-fr-mz', '--max-fr-mz', - '--monitor-mod', '--var-mod', '--fixed-mod', + '--monitor-mod', '--var-mod', '--fixed-mod', '--dda', '--channels', '--lib-fixed-mod', '--original-mods'] // Sort by length descending so longer flags (e.g. 
--mass-acc-ms1) are matched before shorter prefixes (--mass-acc) blocked.sort { a -> -a.length() }.each { flag -> @@ -82,6 +83,11 @@ process INDIVIDUAL_ANALYSIS { diann_no_peptidoforms = params.diann_no_peptidoforms ? "--no-peptidoforms" : "" diann_tims_sum = params.diann_tims_sum ? "--quant-tims-sum" : "" diann_im_window = params.diann_im_window ? "--im-window $params.diann_im_window" : "" + diann_dda_flag = meta.acquisition_method == 'dda' ? "--dda" : "" + + // Flags removed in DIA-NN 2.3.x — only pass for older versions + no_ifs_removal = VersionUtils.versionLessThan(params.diann_version, '2.3') ? "--no-ifs-removal" : "" + no_main_report = VersionUtils.versionLessThan(params.diann_version, '2.3') ? "--no-main-report" : "" // Per-file scan ranges from SDRF (empty = no flag, DIA-NN auto-detects) min_pr_mz = meta['ms1minmz'] ? "--min-pr-mz ${meta['ms1minmz']}" : "" @@ -102,8 +108,8 @@ process INDIVIDUAL_ANALYSIS { --mass-acc ${mass_acc_ms2} \\ --mass-acc-ms1 ${mass_acc_ms1} \\ --window ${scan_window} \\ - --no-ifs-removal \\ - --no-main-report \\ + ${no_ifs_removal} \\ + ${no_main_report} \\ --relaxed-prot-inf \\ --pg-level $params.pg_level \\ ${min_pr_mz} \\ @@ -113,6 +119,7 @@ process INDIVIDUAL_ANALYSIS { ${diann_no_peptidoforms} \\ ${diann_tims_sum} \\ ${diann_im_window} \\ + ${diann_dda_flag} \\ \${mod_flags} \\ $args diff --git a/modules/local/diann/insilico_library_generation/main.nf b/modules/local/diann/insilico_library_generation/main.nf index b347483..66bca5e 100644 --- a/modules/local/diann/insilico_library_generation/main.nf +++ b/modules/local/diann/insilico_library_generation/main.nf @@ -2,6 +2,7 @@ process INSILICO_LIBRARY_GENERATION { tag "$fasta.name" label 'process_medium' label 'diann' + label 'error_retry' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' : @@ -10,6 +11,7 @@ process INSILICO_LIBRARY_GENERATION { input: path(fasta) path(diann_config) + val(is_dda) output: path "versions.yml", emit: versions @@ -29,7 +31,8 @@ process INSILICO_LIBRARY_GENERATION { '--missed-cleavages', '--min-pep-len', '--max-pep-len', '--min-pr-charge', '--max-pr-charge', '--var-mods', '--min-pr-mz', '--max-pr-mz', '--min-fr-mz', '--max-fr-mz', - '--met-excision', '--monitor-mod'] + '--met-excision', '--monitor-mod', '--dda', '--light-models', + '--infin-dia', '--pre-select'] // Sort by length descending so longer flags (e.g. --fasta-search) are matched before shorter prefixes (--fasta, --f) blocked.sort { a -> -a.length() }.each { flag -> def flagPattern = '(?<=^|\\s)' + java.util.regex.Pattern.quote(flag) + '(?=\\s|\$)(\\s+(?!-{1,2}[a-zA-Z])\\S+)*' @@ -45,6 +48,10 @@ process INSILICO_LIBRARY_GENERATION { max_fr_mz = params.max_fr_mz ? "--max-fr-mz $params.max_fr_mz":"" met_excision = params.met_excision ? "--met-excision" : "" diann_no_peptidoforms = params.diann_no_peptidoforms ? "--no-peptidoforms" : "" + diann_dda_flag = is_dda ? "--dda" : "" + diann_light_models = params.diann_light_models ? "--light-models" : "" + infin_dia_flag = params.enable_infin_dia ? "--infin-dia" : "" + pre_select_flag = (params.enable_infin_dia && params.diann_pre_select) ? 
"--pre-select $params.diann_pre_select" : "" """ diann `cat ${diann_config}` \\ @@ -65,7 +72,11 @@ process INSILICO_LIBRARY_GENERATION { --verbose $params.diann_debug \\ --gen-spec-lib \\ ${diann_no_peptidoforms} \\ + ${diann_light_models} \\ + ${infin_dia_flag} \\ + ${pre_select_flag} \\ ${met_excision} \\ + ${diann_dda_flag} \\ ${args} cp *lib.log.txt silicolibrarygeneration.log diff --git a/modules/local/diann/insilico_library_generation/meta.yml b/modules/local/diann/insilico_library_generation/meta.yml index 5f9d68b..a6185d7 100644 --- a/modules/local/diann/insilico_library_generation/meta.yml +++ b/modules/local/diann/insilico_library_generation/meta.yml @@ -19,6 +19,9 @@ input: type: file description: specifies a configuration file to load options/commands from. pattern: "*.cfg" + - is_dda: + type: boolean + description: Whether DDA mode is enabled (auto-detected from SDRF or set via --diann_dda) output: - predict_speclib: type: file diff --git a/modules/local/diann/preliminary_analysis/main.nf b/modules/local/diann/preliminary_analysis/main.nf index 8bb818b..f9a3508 100644 --- a/modules/local/diann/preliminary_analysis/main.nf +++ b/modules/local/diann/preliminary_analysis/main.nf @@ -2,6 +2,7 @@ process PRELIMINARY_ANALYSIS { tag "$ms_file.baseName" label 'process_high' label 'diann' + label 'error_retry' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://containers.biocontainers.pro/s3/SingImgsRepo/diann/v1.8.1_cv1/diann_v1.8.1_cv1.img' : @@ -27,7 +28,7 @@ process PRELIMINARY_ANALYSIS { '--mass-acc', '--mass-acc-ms1', '--window', '--quick-mass-acc', '--min-corr', '--corr-diff', '--time-corr-only', '--min-pr-mz', '--max-pr-mz', '--min-fr-mz', '--max-fr-mz', - '--monitor-mod', '--var-mod', '--fixed-mod', '--no-prot-inf', + '--monitor-mod', '--var-mod', '--fixed-mod', '--no-prot-inf', '--dda', '--channels', '--lib-fixed-mod', '--original-mods'] // Sort by length descending so longer flags (e.g. 
--mass-acc-ms1) are matched before shorter prefixes (--mass-acc) blocked.sort { a -> -a.length() }.each { flag -> @@ -67,6 +68,7 @@ process PRELIMINARY_ANALYSIS { scan_window = params.scan_window_automatic ? '' : "--window $params.scan_window" diann_tims_sum = params.diann_tims_sum ? "--quant-tims-sum" : "" diann_im_window = params.diann_im_window ? "--im-window $params.diann_im_window" : "" + diann_dda_flag = meta.acquisition_method == 'dda' ? "--dda" : "" // Per-file scan ranges from SDRF (empty = no flag, DIA-NN auto-detects) min_pr_mz = meta['ms1minmz'] ? "--min-pr-mz ${meta['ms1minmz']}" : "" @@ -102,6 +104,7 @@ process PRELIMINARY_ANALYSIS { ${diann_tims_sum} \\ ${diann_im_window} \\ --no-prot-inf \\ + ${diann_dda_flag} \\ \${mod_flags} \\ $args diff --git a/modules/local/pmultiqc/meta.yml b/modules/local/pmultiqc/meta.yml index adf63f2..fcca33e 100644 --- a/modules/local/pmultiqc/meta.yml +++ b/modules/local/pmultiqc/meta.yml @@ -23,7 +23,7 @@ output: pattern: "*.html" - quantmsdb: type: file - description: Sqlite3 database file stored protein psm and quantification information + description: SQLite3 database file that stores protein, PSM, and quantification information pattern: "*.db" - data: type: dir diff --git a/modules/local/samplesheet_check/main.nf b/modules/local/samplesheet_check/main.nf index f2b7112..76fae90 100644 --- a/modules/local/samplesheet_check/main.nf +++ b/modules/local/samplesheet_check/main.nf @@ -23,6 +23,7 @@ process SAMPLESHEET_CHECK { def string_use_ols_cache_only = params.use_ols_cache_only == true ? 
"--use_ols_cache_only" : "" """ + set -o pipefail # Get basename and create output filename BASENAME=\$(basename "${input_file}") # Remove .sdrf.tsv, .sdrf.csv, or .sdrf extension (in that order to match longest first) diff --git a/modules/local/sdrf_parsing/main.nf b/modules/local/sdrf_parsing/main.nf index fabc1a8..1ae0c3b 100644 --- a/modules/local/sdrf_parsing/main.nf +++ b/modules/local/sdrf_parsing/main.nf @@ -22,6 +22,7 @@ process SDRF_PARSING { def diann_version_flag = params.diann_version ? "--diann_version '${params.diann_version}'" : '' """ + set -o pipefail parse_sdrf convert-diann \\ -s ${sdrf} \\ ${mod_loc_flag} \\ diff --git a/modules/local/sdrf_parsing/meta.yml b/modules/local/sdrf_parsing/meta.yml index 7c311f4..846cfaa 100644 --- a/modules/local/sdrf_parsing/meta.yml +++ b/modules/local/sdrf_parsing/meta.yml @@ -28,7 +28,7 @@ output: type: file description: log file pattern: "*.log" - - version: + - versions: type: file description: File containing software version pattern: "versions.yml" diff --git a/modules/local/utils/decompress_dotd/meta.yml b/modules/local/utils/decompress_dotd/meta.yml index bbc7c58..55330d3 100644 --- a/modules/local/utils/decompress_dotd/meta.yml +++ b/modules/local/utils/decompress_dotd/meta.yml @@ -22,7 +22,7 @@ input: type: file description: | Bruker Raw file archived using tar - pattern: "*.{d.tar,.tar,.gz,.d.tar.gz}" + pattern: "*.{d.tar,tar,gz,d.tar.gz}" output: - meta: type: map diff --git a/modules/local/utils/tdf2mzml/main.nf b/modules/local/utils/tdf2mzml/main.nf deleted file mode 100644 index a242935..0000000 --- a/modules/local/utils/tdf2mzml/main.nf +++ /dev/null @@ -1,38 +0,0 @@ -process TDF2MZML { - tag "$meta.id" - label 'process_single' - label 'error_retry' - - container 'quay.io/bigbio/tdf2mzml:latest' // TODO: pin to a specific version tag for reproducibility - - input: - tuple val(meta), path(rawfile) - - output: - tuple val(meta), path("*.mzML"), emit: mzmls_converted - path "versions.yml", emit: 
versions - path "*.log", emit: log - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - """ - echo "Converting..." | tee --append ${rawfile.baseName}_conversion.log - tdf2mzml.py -i *.d $args 2>&1 | tee --append ${rawfile.baseName}_conversion.log - - # Rename .mzml to .mzML via temp file to handle case-insensitive filesystems (e.g. macOS) - mv *.mzml __tmp_converted.mzML && mv __tmp_converted.mzML ${file(rawfile.baseName).baseName}.mzML - - # Rename .d directory only if the name differs (avoid 'same file' error) - target_d="${file(rawfile.baseName).baseName}.d" - if [ ! -d "\${target_d}" ]; then - mv *.d "\${target_d}" - fi - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - tdf2mzml.py: \$(tdf2mzml.py --version) - END_VERSIONS - """ -} diff --git a/modules/local/utils/tdf2mzml/meta.yml b/modules/local/utils/tdf2mzml/meta.yml deleted file mode 100644 index ebb90b8..0000000 --- a/modules/local/utils/tdf2mzml/meta.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: tdf2mzml -description: convert raw bruker files to mzml files -keywords: - - raw - - mzML - - .d -tools: - - tdf2mzml: - description: | - It takes a bruker .d raw file as input and outputs indexed mzML - homepage: https://github.com/mafreitas/tdf2mzml - documentation: https://github.com/mafreitas/tdf2mzml -input: - - meta: - type: map - description: | - Groovy Map containing sample information - - rawfile: - type: file - description: | - Bruker .d raw directory - pattern: "*.d" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'MD5', enzyme:trypsin ] - - mzml: - type: file - description: indexed mzML - pattern: "*.mzML" - - log: - type: file - description: log file - pattern: "*.log" - - version: - type: file - description: File containing software version - pattern: "versions.yml" -authors: - - "@jspaezp" diff --git a/nextflow.config b/nextflow.config index 33c5c95..a8fbb0d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -52,6 +52,14 @@ params { diann_debug = 3 diann_speclib = null diann_extra_args = null + diann_dda = false // Fallback: explicitly enable DDA when SDRF lacks acquisition method (requires DIA-NN >= 2.3.2) + diann_light_models = false // add '--light-models' for 10x faster library generation (DIA-NN >= 2.0) + diann_export_quant = false // add '--export-quant' for fragment-level parquet export (DIA-NN >= 2.0) + diann_site_ms1_quant = false // add '--site-ms1-quant' for MS1 apex PTM quantification (DIA-NN >= 2.0) + + // DIA-NN: InfinDIA (experimental, v2.3.0+) + enable_infin_dia = false // Enable InfinDIA for ultra-large search spaces + diann_pre_select = null // --pre-select N precursor limit for InfinDIA // Optional outputs — control which intermediate files are published save_speclib_tsv = false // Save the TSV spectral library from in-silico generation @@ -93,6 +101,8 @@ params { quantums_params = null diann_no_peptidoforms = false // add '--no-peptidoforms' diann_use_quant = true // add '--use-quant' to FINAL_QUANTIFICATION + diann_channel_run_norm = false // add '--channel-run-norm' to FINAL_QUANTIFICATION + diann_channel_spec_norm = false // add '--channel-spec-norm' to FINAL_QUANTIFICATION // pmultiqc options enable_pmultiqc = true @@ -237,10 +247,13 @@ profiles { test_dia_2_2_0 { includeConfig 'conf/tests/test_dia_2_2_0.config' } test_latest_dia { includeConfig 'conf/tests/test_latest_dia.config' } test_full_dia { includeConfig 'conf/tests/test_full_dia.config' } + test_dda { includeConfig 'conf/tests/test_dda.config' } + test_dia_skip_preanalysis { 
includeConfig 'conf/tests/test_dia_skip_preanalysis.config' } // DIA-NN version overrides (used by merge_ci.yml matrix) diann_v1_8_1 { includeConfig 'conf/diann_versions/v1_8_1.config' } diann_v2_1_0 { includeConfig 'conf/diann_versions/v2_1_0.config' } diann_v2_2_0 { includeConfig 'conf/diann_versions/v2_2_0.config' } + diann_v2_3_2 { includeConfig 'conf/diann_versions/v2_3_2.config' } dev { includeConfig 'conf/dev.config' } pride_slurm { includeConfig 'conf/pride_codon_slurm.config' } manual_wave { includeConfig 'conf/wave.config' } @@ -362,7 +375,7 @@ manifest { mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '!>=25.04.0' - version = '1.0.0' + version = '2.0.0dev' doi = '10.5281/zenodo.15573386' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 8708cf7..9fe8f21 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -411,6 +411,19 @@ "fa_icon": "far fa-check-square", "default": true }, + + "diann_channel_run_norm": { + "type": "boolean", + "description": "Set '--channel-run-norm'. Run-specific normalisation for multiplexing (e.g., Protein Turnover SILAC)", + "fa_icon": "far fa-check-square", + "default": false + }, + "diann_channel_spec_norm": { + "type": "boolean", + "description": "Set '--channel-spec-norm'. Channel-specific normalisation for multiplexing (Independent Samples)", + "fa_icon": "far fa-check-square", + "default": false + }, "skip_preliminary_analysis": { "type": "boolean", "description": "Skip the preliminary analysis step, thus use the passed spectral library as-is instead of generating a local consensus library.", @@ -456,6 +469,41 @@ "hidden": false, "help_text": "Pass additional DIA-NN command-line arguments that will be appended to all DIA-NN steps (INSILICO_LIBRARY_GENERATION, PRELIMINARY_ANALYSIS, ASSEMBLE_EMPIRICAL_LIBRARY, INDIVIDUAL_ANALYSIS, FINAL_QUANTIFICATION). Flags that conflict with a specific step are automatically stripped with a warning. 
For step-specific overrides, use custom Nextflow config files with ext.args." }, + "diann_dda": { + "type": "boolean", + "description": "Explicitly enable DDA mode. Normally auto-detected from the SDRF acquisition method column. Use only when SDRF lacks this column. Requires DIA-NN >= 2.3.2.", + "fa_icon": "fas fa-flask", + "default": false + }, + "diann_light_models": { + "type": "boolean", + "description": "Enable --light-models for 10x faster in-silico library generation (DIA-NN >= 2.0).", + "fa_icon": "fas fa-bolt", + "default": false + }, + "diann_export_quant": { + "type": "boolean", + "description": "Enable --export-quant for fragment-level parquet data export (DIA-NN >= 2.0).", + "fa_icon": "fas fa-file-export", + "default": false + }, + "diann_site_ms1_quant": { + "type": "boolean", + "description": "Enable --site-ms1-quant to use MS1 apex intensities for PTM site quantification (DIA-NN >= 2.0).", + "fa_icon": "fas fa-crosshairs", + "default": false + }, + "enable_infin_dia": { + "type": "boolean", + "description": "Enable InfinDIA for ultra-large search spaces (DIA-NN >= 2.3.0). Experimental.", + "fa_icon": "fas fa-infinity", + "default": false + }, + "diann_pre_select": { + "type": "integer", + "description": "Set --pre-select N precursor limit for InfinDIA pre-search.", + "fa_icon": "fas fa-filter" + }, "save_speclib_tsv": { "type": "boolean", "default": false, diff --git a/subworkflows/local/create_input_channel/main.nf b/subworkflows/local/create_input_channel/main.nf index 588e144..2208eca 100644 --- a/subworkflows/local/create_input_channel/main.nf +++ b/subworkflows/local/create_input_channel/main.nf @@ -29,8 +29,23 @@ workflow CREATE_INPUT_CHANNEL { .combine(ch_expdesign) .splitCsv(header: true, sep: '\t') .map { experiment_id, row -> + def filestr + if (!params.root_folder) { + filestr = row.URI?.toString()?.trim() ? 
row.URI.toString() : row.Filename.toString() + } else { + filestr = row.Filename.toString() + filestr = params.root_folder + File.separator + filestr + filestr = (params.local_input_type + ? filestr.take(filestr.lastIndexOf('.')) + '.' + params.local_input_type + : filestr) + } + return [filestr, experiment_id, row] + } + .groupTuple(by: 0) + .map { filestr, experiment_ids, rows -> + def experiment_id = experiment_ids[0] def wrapper = [acquisition_method: "", experiment_id: experiment_id] - create_meta_channel(row, enzymes, files, wrapper) + return create_meta_channel_grouped(filestr, rows, wrapper) } .set { ch_meta_config_dia } @@ -42,66 +57,63 @@ workflow CREATE_INPUT_CHANNEL { } // Function to get list of [meta, [ spectra_files ]] -def create_meta_channel(LinkedHashMap row, enzymes, files, wrapper) { +def create_meta_channel_grouped(String filestr, List rows, Map wrapper) { def meta = [:] - def filestr - // Always use SDRF format - if (!params.root_folder) { - filestr = row.URI?.toString()?.trim() ? row.URI.toString() : row.Filename.toString() - } - else { - filestr = row.Filename.toString() - } + def base_row = rows[0] def fileName = file(filestr).name def dotIndex = fileName.lastIndexOf('.') meta.id = dotIndex > 0 ? fileName.take(dotIndex) : fileName meta.experiment_id = wrapper.experiment_id - // apply transformations given by specified root_folder and type - if (params.root_folder) { - filestr = params.root_folder + File.separator + filestr - filestr = (params.local_input_type - ? filestr.take(filestr.lastIndexOf('.')) + '.' + params.local_input_type - : filestr) - } - // existence check if (!file(filestr).exists()) { exit(1, "ERROR: Please check input file -> File Uri does not exist!\n${filestr}") } - // Validate acquisition method is DIA - // AcquisitionMethod is already extracted by convert-diann (e.g. 
"Data-Independent Acquisition") - def acqMethod = row.AcquisitionMethod?.toString()?.trim() ?: "" + // Detect acquisition method from SDRF or fallback to --diann_dda param + def acqMethod = base_row.AcquisitionMethod?.toString()?.trim() ?: "" if (acqMethod.toLowerCase().contains("data-independent acquisition") || acqMethod.toLowerCase().contains("dia")) { meta.acquisition_method = "dia" - } - else if (acqMethod.isEmpty()) { - // If no acquisition method column in SDRF, assume DIA (this is a DIA-only pipeline) - meta.acquisition_method = "dia" - } - else { - log.error("This pipeline only supports Data-Independent Acquisition (DIA). Found: '${acqMethod}'. Use the quantms pipeline for DDA workflows.") + } else if (acqMethod.toLowerCase().contains("data-dependent acquisition") || acqMethod.toLowerCase().contains("dda")) { + meta.acquisition_method = "dda" + } else if (acqMethod.isEmpty()) { + meta.acquisition_method = params.diann_dda ? "dda" : "dia" + } else { + log.error("Unsupported acquisition method: '${acqMethod}'. This pipeline supports DIA and DDA. Found in file: ${filestr}") exit(1) } - // DissociationMethod is already normalized by convert-diann (HCD, CID, ETD, ECD) - meta.dissociationmethod = row.DissociationMethod?.toString()?.trim() ?: "" - + meta.dissociationmethod = base_row.DissociationMethod?.toString()?.trim() ?: "" wrapper.acquisition_method = meta.acquisition_method - // Validate required SDRF columns - these parameters are exclusively read from SDRF (no command-line override) + def labels = rows.collect { it.Label?.toString()?.trim() }.findAll { it }.unique() + meta.labelling_type = labels.join(';') + + def is_plexdia = labels.size() > 1 || (labels.size() == 1 && !labels[0].toLowerCase().contains("label free")) + meta.plexdia = is_plexdia + + def enzymes = rows.collect { it.Enzyme?.toString()?.trim() }.findAll { it }.unique() + if (enzymes.size() > 1) { + log.error("Currently only one enzyme is supported per file. 
Found conflicting enzymes for ${filestr}: '${enzymes}'.") + exit(1) + } + meta.enzyme = enzymes ? enzymes[0] : null + + def fixedMods = rows.collect { it.FixedModifications?.toString()?.trim() }.findAll { it }.unique() + meta.fixedmodifications = fixedMods ? fixedMods[0] : null + + // Validate required SDRF columns def requiredColumns = [ - 'Label': row.Label, - 'Enzyme': row.Enzyme, - 'FixedModifications': row.FixedModifications + 'Label': meta.labelling_type, + 'Enzyme': meta.enzyme, + 'FixedModifications': meta.fixedmodifications ] def missingColumns = [] requiredColumns.each { colName, colValue -> - if (colValue == null || colValue.toString().trim().isEmpty()) { + if (colValue == null || colValue.toString().isEmpty()) { missingColumns.add(colName) } } @@ -112,20 +124,13 @@ def create_meta_channel(LinkedHashMap row, enzymes, files, wrapper) { exit(1) } - // Set values from SDRF (required columns) - meta.labelling_type = row.Label - meta.fixedmodifications = row.FixedModifications - meta.enzyme = row.Enzyme - - // Set tolerance values: use SDRF if available, otherwise fall back to params def validUnits = ['ppm', 'da', 'Da', 'PPM'] - // Precursor mass tolerance - if (row.PrecursorMassTolerance != null && !row.PrecursorMassTolerance.toString().trim().isEmpty()) { + if (base_row.PrecursorMassTolerance != null && !base_row.PrecursorMassTolerance.toString().trim().isEmpty()) { try { - meta.precursormasstolerance = Double.parseDouble(row.PrecursorMassTolerance) + meta.precursormasstolerance = Double.parseDouble(base_row.PrecursorMassTolerance) } catch (NumberFormatException e) { - log.error("ERROR: Invalid PrecursorMassTolerance value '${row.PrecursorMassTolerance}' for file '${filestr}'. Must be a valid number.") + log.error("ERROR: Invalid PrecursorMassTolerance value '${base_row.PrecursorMassTolerance}' for file '${filestr}'. 
Must be a valid number.") exit(1) } } else { @@ -133,23 +138,21 @@ def create_meta_channel(LinkedHashMap row, enzymes, files, wrapper) { meta.precursormasstolerance = params.precursor_mass_tolerance } - // Precursor mass tolerance unit - if (row.PrecursorMassToleranceUnit != null && !row.PrecursorMassToleranceUnit.toString().trim().isEmpty()) { - if (!validUnits.any { row.PrecursorMassToleranceUnit.toString().equalsIgnoreCase(it) }) { - log.error("ERROR: Invalid PrecursorMassToleranceUnit '${row.PrecursorMassToleranceUnit}' for file '${filestr}'. Must be 'ppm' or 'Da'.") + if (base_row.PrecursorMassToleranceUnit != null && !base_row.PrecursorMassToleranceUnit.toString().trim().isEmpty()) { + if (!validUnits.any { base_row.PrecursorMassToleranceUnit.toString().equalsIgnoreCase(it) }) { + log.error("ERROR: Invalid PrecursorMassToleranceUnit '${base_row.PrecursorMassToleranceUnit}' for file '${filestr}'. Must be 'ppm' or 'Da'.") exit(1) } - meta.precursormasstoleranceunit = row.PrecursorMassToleranceUnit + meta.precursormasstoleranceunit = base_row.PrecursorMassToleranceUnit } else { meta.precursormasstoleranceunit = params.precursor_mass_tolerance_unit } - // Fragment mass tolerance - if (row.FragmentMassTolerance != null && !row.FragmentMassTolerance.toString().trim().isEmpty()) { + if (base_row.FragmentMassTolerance != null && !base_row.FragmentMassTolerance.toString().trim().isEmpty()) { try { - meta.fragmentmasstolerance = Double.parseDouble(row.FragmentMassTolerance) + meta.fragmentmasstolerance = Double.parseDouble(base_row.FragmentMassTolerance) } catch (NumberFormatException e) { - log.error("ERROR: Invalid FragmentMassTolerance value '${row.FragmentMassTolerance}' for file '${filestr}'. Must be a valid number.") + log.error("ERROR: Invalid FragmentMassTolerance value '${base_row.FragmentMassTolerance}' for file '${filestr}'. 
Must be a valid number.") exit(1) } } else { @@ -157,43 +160,26 @@ def create_meta_channel(LinkedHashMap row, enzymes, files, wrapper) { meta.fragmentmasstolerance = params.fragment_mass_tolerance } - // Fragment mass tolerance unit - if (row.FragmentMassToleranceUnit != null && !row.FragmentMassToleranceUnit.toString().trim().isEmpty()) { - if (!validUnits.any { row.FragmentMassToleranceUnit.toString().equalsIgnoreCase(it) }) { - log.error("ERROR: Invalid FragmentMassToleranceUnit '${row.FragmentMassToleranceUnit}' for file '${filestr}'. Must be 'ppm' or 'Da'.") + if (base_row.FragmentMassToleranceUnit != null && !base_row.FragmentMassToleranceUnit.toString().trim().isEmpty()) { + if (!validUnits.any { base_row.FragmentMassToleranceUnit.toString().equalsIgnoreCase(it) }) { + log.error("ERROR: Invalid FragmentMassToleranceUnit '${base_row.FragmentMassToleranceUnit}' for file '${filestr}'. Must be 'ppm' or 'Da'.") exit(1) } - meta.fragmentmasstoleranceunit = row.FragmentMassToleranceUnit + meta.fragmentmasstoleranceunit = base_row.FragmentMassToleranceUnit } else { meta.fragmentmasstoleranceunit = params.fragment_mass_tolerance_unit } - // Variable modifications: use SDRF if available, otherwise fall back to params - if (row.VariableModifications != null && !row.VariableModifications.toString().trim().isEmpty()) { - meta.variablemodifications = row.VariableModifications + if (base_row.VariableModifications != null && !base_row.VariableModifications.toString().trim().isEmpty()) { + meta.variablemodifications = base_row.VariableModifications } else { meta.variablemodifications = params.variable_mods } - // Per-file scan ranges (empty string = no flags passed, DIA-NN auto-detects) - meta.ms1minmz = row.MS1MinMz?.toString()?.trim() ?: "" - meta.ms1maxmz = row.MS1MaxMz?.toString()?.trim() ?: "" - meta.ms2minmz = row.MS2MinMz?.toString()?.trim() ?: "" - meta.ms2maxmz = row.MS2MaxMz?.toString()?.trim() ?: "" - - enzymes += row.Enzyme - if (enzymes.size() > 1) { - 
log.error("Currently only one enzyme is supported for the whole experiment. Specified was '${enzymes}'. Check or split your SDRF.") - log.error(filestr) - exit(1) - } - - // Check for duplicate files - if (filestr in files) { - log.error("Currently only one DIA-NN setting per file is supported for the whole experiment. ${filestr} has multiple entries in your SDRF. Consider splitting your design into multiple experiments.") - exit(1) - } - files += filestr + meta.ms1minmz = base_row.MS1MinMz?.toString()?.trim() ?: "" + meta.ms1maxmz = base_row.MS1MaxMz?.toString()?.trim() ?: "" + meta.ms2minmz = base_row.MS2MinMz?.toString()?.trim() ?: "" + meta.ms2maxmz = base_row.MS2MaxMz?.toString()?.trim() ?: "" return [meta, filestr] } diff --git a/subworkflows/local/file_preparation/meta.yml b/subworkflows/local/file_preparation/meta.yml index 54211c7..54d34fc 100644 --- a/subworkflows/local/file_preparation/meta.yml +++ b/subworkflows/local/file_preparation/meta.yml @@ -8,7 +8,6 @@ keywords: - proteomics components: - thermorawfileparser - - tdf2mzml - decompress - mzml/indexing - mzml/statistics diff --git a/subworkflows/local/input_check/meta.yml b/subworkflows/local/input_check/meta.yml index abe2c7f..1f2cefb 100644 --- a/subworkflows/local/input_check/meta.yml +++ b/subworkflows/local/input_check/meta.yml @@ -14,7 +14,7 @@ input: description: | Input file to be validated output: - - ch_input_file: + - input_file: type: file description: | Channel containing validated input files diff --git a/subworkflows/local/utils_nfcore_quantms_pipeline/meta.yml b/subworkflows/local/utils_nfcore_quantms_pipeline/meta.yml index 06365ae..cf1fd6d 100644 --- a/subworkflows/local/utils_nfcore_quantms_pipeline/meta.yml +++ b/subworkflows/local/utils_nfcore_quantms_pipeline/meta.yml @@ -4,7 +4,7 @@ description: Pipeline completion utilities for the nf-core quantmsdiann pipeline keywords: - utils - nf-core - - quantms + - quantmsdiann components: - completionemail - completionsummary diff 
--git a/workflows/dia.nf b/workflows/dia.nf index 8a1b13b..69dd78d 100644 --- a/workflows/dia.nf +++ b/workflows/dia.nf @@ -34,7 +34,29 @@ workflow DIA { main: ch_software_versions = channel.empty() - ch_searchdb = channel.fromPath(params.database, checkIfExists: true).first() + + // Version guard for DDA mode (when explicitly set via param) + if (params.diann_dda && VersionUtils.versionLessThan(params.diann_version, '2.3.2')) { + error("DDA mode (--diann_dda) requires DIA-NN >= 2.3.2. Current version: ${params.diann_version}. Use -profile diann_v2_3_2") + } + + // Version guard for InfinDIA + if (params.enable_infin_dia && VersionUtils.versionLessThan(params.diann_version, '2.3.0')) { + error("InfinDIA requires DIA-NN >= 2.3.0. Current version: ${params.diann_version}. Use -profile diann_v2_3_2") + } + + // Version guard for DIA-NN 2.0+ features + if ((params.diann_light_models || params.diann_export_quant || params.diann_site_ms1_quant) && VersionUtils.versionLessThan(params.diann_version, '2.0')) { + def enabled = [] + if (params.diann_light_models) enabled << '--light-models' + if (params.diann_export_quant) enabled << '--export-quant' + if (params.diann_site_ms1_quant) enabled << '--site-ms1-quant' + error("${enabled.join(', ')} require DIA-NN >= 2.0. Current version: ${params.diann_version}. Use -profile diann_v2_1_0 or later") + } + + ch_searchdb = channel.fromPath(params.database, checkIfExists: true) + .ifEmpty { error("No protein database found at '${params.database}'. Provide --database ") } + .first() ch_file_preparation_results.multiMap { result -> @@ -42,7 +64,18 @@ workflow DIA { ms_file:result[1] }.set { ch_result } - ch_experiment_meta = ch_result.meta.unique { m -> m.experiment_id }.first() + ch_experiment_meta = ch_result.meta.unique { m -> m.experiment_id } + .ifEmpty { error("No valid input files found after SDRF parsing. 
Check your SDRF file and input paths.") } + .first() + + // Determine DDA mode: true if explicitly set via param OR auto-detected from SDRF + ch_is_dda = ch_experiment_meta.map { meta -> + def dda = params.diann_dda || meta.acquisition_method == 'dda' + if (dda && VersionUtils.versionLessThan(params.diann_version, '2.3.2')) { + error("DDA mode (detected from SDRF) requires DIA-NN >= 2.3.2. Current version: ${params.diann_version}. Use -profile diann_v2_3_2") + } + return dda + } // diann_config.cfg comes directly from SDRF_PARSING (convert-diann) // Use as value channel so it can be consumed by all per-file processes @@ -54,7 +87,7 @@ workflow DIA { if (params.diann_speclib != null && params.diann_speclib.toString() != "") { speclib = channel.from(file(params.diann_speclib, checkIfExists: true)) } else { - INSILICO_LIBRARY_GENERATION(ch_searchdb, ch_diann_cfg_val) + INSILICO_LIBRARY_GENERATION(ch_searchdb, ch_diann_cfg_val, ch_is_dda) speclib = INSILICO_LIBRARY_GENERATION.out.predict_speclib } diff --git a/workflows/quantmsdiann.nf b/workflows/quantmsdiann.nf index 079858a..e55a1e3 100644 --- a/workflows/quantmsdiann.nf +++ b/workflows/quantmsdiann.nf @@ -60,7 +60,7 @@ workflow QUANTMSDIANN { FILE_PREPARATION.out.results .branch { item -> - dia: item[0].acquisition_method.toLowerCase().contains("dia") + dia: item[0].acquisition_method.toLowerCase().contains("dia") || item[0].acquisition_method.toLowerCase().contains("dda") } .set { ch_fileprep_result } //