add alternative IDs, update kraken module and readme [ci skip]

emilyncosta · Jan 23, 2024 · 8f32c8e · 8f32c8e
1 parent 8ffbc6d
commit 8f32c8e
Show file tree

Hide file tree

Showing 10 changed files with 388 additions and 21 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,5 @@
-[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)
+[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.8220379-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.8220379)
+
 
 [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/)
 [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
@@ -8,20 +9,18 @@
 
 ## Introduction
 
-**UNIFESP_ntm_newspecies_nf** is a bioinformatics pipeline used for the paper entitled "Description of new species of Mycobacterium terrae complex isolated from sewage at the São Paulo Zoological Park Foundation in Brazil".
+**UNIFESP_ntm_newspecies_nf** is a bioinformatics pipeline used for the paper entitled [Description of new species of Mycobacterium terrae complex isolated from sewage at the São Paulo Zoological Park Foundation in Brazil](https://doi.org/10.3389/fmicb.2024.1335985).
 
-<!-- TODO nf-core:
-   Complete this sentence with a 2-3 sentence summary of what types of data the pipeline ingests, a brief overview of the
-   major pipeline sections and the types of output it produces. You're giving an overview to someone new
-   to nf-core here, in 15-20 seconds. For an example, see https://github.com/nf-core/rnaseq/blob/master/README.md#introduction
--->
 
-<!-- TODO nf-core: Include a figure that guides the user through the major workflow steps. Many nf-core
-     workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples.   -->
-<!-- TODO nf-core: Fill in short bullet-pointed list of the default steps in the pipeline -->
+## IDs for the submitted FASTQ samples
 
-1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
-2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
+| Sample ID |  ATCC ID | JCM ID |        Nomenclature       | Nomenclature alias (Gupta et al. 2018) | GenBank 16S rRNA gene |  Bioproject |   Biosample  |     SRA     | GenBank Draft Genome | PGAP Annotation |
+|:---------:|:--------:|:------:|:-------------------------:|:-------------------------------------:|:---------------------:|:-----------:|:------------:|:-----------:|:--------------------:|:---------------:|
+| MYC017    | TSD-296T | 35364T | Mycobacterium vasticus    | Mycolicibacter vasticus               | MK890459.1            | PRJNA755977 | SAMN20959233 | SRR27405758 | CP084028             | JAYJJQ000000000 |
+| MYC098    | TSD-297T | 35365T | Mycobacterium crassicus   | Mycolicibacter crassicus              | MK890478.1            | PRJNA757362 | SAMN20959234 | SRR27405950 | CP084029             | JAYJJR000000000 |
+| MYC101    | TSD-298T | 35366T | Mycobacterium zoologicum  | Mycolicibacter zoologicum             | MK890479.1            | PRJNA757364 | SAMN20959235 | SRR27405954 | CP084030             | JAYJJS000000000 |
+| MYC123    | BAA3216  | 35367  | Mycobacterium zoologicum  | Mycolicibacter zoologicum             | MK890481.1            | PRJNA743883 | SAMN20062777 | SRR27406169 | CP083985             | JAYJJT000000000 |
+| MYC340    | TSD-299T | 35368T | Mycobacterium nativiensis | Mycolicibacter nativiensis            | MK890521.1            | PRJNA743885 | SAMN20062778 | SRR27406220 | CP083986             | JAYJJU000000000 |
 
 ## Usage
 
@@ -51,8 +50,8 @@ Now, you can run the pipeline using:
 <!-- TODO nf-core: update the following command to include all required parameters for a minimal example -->
 
 ```bash
-nextflow run UNIFESP_LABMICOBACT/ntm_mterrae_nf \
-   -profile <docker/singularity/.../institute> \
+nextflow run emilyncosta/UNIFESP_ntm_newspecies_nf \
+   -profile docker \
    --input samplesheet.csv \
    --outdir <OUTDIR>
 ```

diff --git a/main.nf b/main.nf
@@ -22,13 +22,13 @@ WorkflowMain.initialise(workflow, params, log)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
-include { NTM_MTERRAE_NF } from './workflows/ntm_mterrae_nf'
+include { NTM_NEWSPECIES_NF } from './workflows/ntm_mterrae_nf'
 
 //
 // WORKFLOW: Run main UNIFESP_LABMICOBACT/ntm_mterrae_nf analysis pipeline
 //
 workflow UNIFESP_LABMICOBACT {
-    NTM_MTERRAE_NF ()
+    NTM_NEWSPECIES_NF ()
 }
 
 /*

diff --git a/modules.json b/modules.json
@@ -20,6 +20,11 @@
             "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
             "installed_by": ["modules"]
           },
+          "kraken2/kraken2": {
+            "branch": "master",
+            "git_sha": "8fae4ee738f645812384d6aba9d0dc651ad79ff9",
+            "installed_by": ["modules"]
+          },
           "multiqc": {
             "branch": "master",
             "git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7",

diff --git a/modules/nf-core/kraken2/kraken2/environment.yml b/modules/nf-core/kraken2/kraken2/environment.yml
@@ -0,0 +1,8 @@
+name: kraken2_kraken2
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::kraken2=2.1.2
+  - conda-forge::pigz=2.6
diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf
@@ -0,0 +1,58 @@
+process KRAKEN2_KRAKEN2 {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' :
+        'biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }"
+
+    input:
+    tuple val(meta), path(reads)
+    path  db
+    val save_output_fastqs
+    val save_reads_assignment
+
+    output:
+    tuple val(meta), path('*.classified{.,_}*')     , optional:true, emit: classified_reads_fastq
+    tuple val(meta), path('*.unclassified{.,_}*')   , optional:true, emit: unclassified_reads_fastq
+    tuple val(meta), path('*classifiedreads.txt')   , optional:true, emit: classified_reads_assignment
+    tuple val(meta), path('*report.txt')                           , emit: report
+    path "versions.yml"                                            , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def paired       = meta.single_end ? "" : "--paired"
+    def classified   = meta.single_end ? "${prefix}.classified.fastq"   : "${prefix}.classified#.fastq"
+    def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq"
+    def classified_option = save_output_fastqs ? "--classified-out ${classified}" : ""
+    def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : ""
+    def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null"
+    def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : ""
+
+    """
+    kraken2 \\
+        --db $db \\
+        --threads $task.cpus \\
+        --report ${prefix}.kraken2.report.txt \\
+        --gzip-compressed \\
+        $unclassified_option \\
+        $classified_option \\
+        $readclassification_option \\
+        $paired \\
+        $args \\
+        $reads
+
+    $compress_reads_command
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//')
+        pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/kraken2/kraken2/meta.yml b/modules/nf-core/kraken2/kraken2/meta.yml
@@ -0,0 +1,78 @@
+name: kraken2_kraken2
+description: Classifies metagenomic sequence data
+keywords:
+  - classify
+  - metagenomics
+  - fastq
+  - db
+tools:
+  - kraken2:
+      description: |
+        Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads
+      homepage: https://ccb.jhu.edu/software/kraken2/
+      documentation: https://github.com/DerrickWood/kraken2/wiki/Manual
+      doi: 10.1186/s13059-019-1891-0
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively.
+  - db:
+      type: directory
+      description: Kraken2 database
+  - save_output_fastqs:
+      type: string
+      description: |
+        If true, optional commands are added to save classified and unclassified reads
+        as fastq files
+  - save_reads_assignment:
+      type: string
+      description: |
+        If true, an optional command is added to save a file reporting the taxonomic
+        classification of each input read
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - classified_reads_fastq:
+      type: file
+      description: |
+        Reads classified as belonging to any of the taxa
+        on the Kraken2 database.
+      pattern: "*{fastq.gz}"
+  - unclassified_reads_fastq:
+      type: file
+      description: |
+        Reads not classified to any of the taxa
+        on the Kraken2 database.
+      pattern: "*{fastq.gz}"
+  - classified_reads_assignment:
+      type: file
+      description: |
+        Kraken2 output file indicating the taxonomic assignment of
+        each input read
+  - report:
+      type: file
+      description: |
+        Kraken2 report containing stats about classified
+        and not classified reads.
+      pattern: "*.{report.txt}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@joseespinosa"
+  - "@drpatelh"
+maintainers:
+  - "@joseespinosa"
+  - "@drpatelh"
diff --git a/modules/nf-core/kraken2/kraken2/tests/main.nf.test b/modules/nf-core/kraken2/kraken2/tests/main.nf.test
@@ -0,0 +1,143 @@
+nextflow_process {
+    name "Test Process KRAKEN2_KRAKEN2"
+    script "../main.nf"
+    process "KRAKEN2_KRAKEN2"
+    tag "modules"
+    tag "modules_nfcore"
+    tag "untar"
+    tag "kraken2"
+    tag "kraken2/kraken2"
+
+    setup {
+        run("UNTAR") {
+            script "modules/nf-core/untar/main.nf"
+            process {
+                """
+                input[0] = Channel.of([
+                    [],
+                    file(
+                        params.test_data['sarscov2']['genome']['kraken2_tar_gz'],
+                        checkIfExists: true
+                    )
+                ])
+                """
+            }
+        }
+    }
+
+    test("sarscov2 illumina single end [fastq]") {
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    [ file(
+                        params.test_data['sarscov2']['illumina']['test_1_fastq_gz'],
+                        checkIfExists: true
+                    )]
+                ]
+                input[1] = UNTAR.out.untar.map{ it[1] }
+                input[2] = true
+                input[3] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+            { assert process.success },
+            { assert snapshot(
+                    process.out.report,
+                    process.out.versions,
+                ).match()
+            },
+            { assert process.out.classified_reads_fastq.get(0).get(1) ==~ ".*/test.classified.fastq.gz" },
+            { assert process.out.unclassified_reads_fastq.get(0).get(1) ==~ ".*/test.unclassified.fastq.gz" },
+            )
+        }
+    }
+
+    test("sarscov2 illumina paired end [fastq]") {
+        when {
+            params {
+                outdir   = "$outputDir"
+            }
+
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    [
+                        file(
+                            params.test_data['sarscov2']['illumina']['test_1_fastq_gz'],
+                            checkIfExists: true
+                        ),
+                        file(
+                            params.test_data['sarscov2']['illumina']['test_2_fastq_gz'],
+                            checkIfExists: true
+                        )
+
+                    ]
+                ]
+                input[1] = UNTAR.out.untar.map{ it[1] }
+                input[2] = true
+                input[3] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+            { assert process.success },
+            { assert snapshot(
+                    process.out.report,
+                    process.out.versions,
+                ).match()
+            },
+            { assert process.out.classified_reads_fastq.get(0).get(1).get(0)
+                ==~ ".*/test.classified_1.fastq.gz" },
+            { assert process.out.classified_reads_fastq.get(0).get(1).get(1)
+                ==~ ".*/test.classified_2.fastq.gz" },
+            { assert process.out.unclassified_reads_fastq.get(0).get(1).get(0)
+                ==~ ".*/test.unclassified_1.fastq.gz" },
+            { assert process.out.unclassified_reads_fastq.get(0).get(1).get(1)
+                ==~ ".*/test.unclassified_2.fastq.gz" },
+            )
+        }
+    }
+
+    test("sarscov2 illumina single end [fastq] + save_reads_assignment") {
+        when {
+            params {
+                outdir   = "$outputDir"
+            }
+
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    [ file(
+                        params.test_data['sarscov2']['illumina']['test_1_fastq_gz'],
+                        checkIfExists: true
+                    )]
+                ]
+                input[1] = UNTAR.out.untar.map{ it[1] }
+                input[2] = false
+                input[3] = true
+                """
+            }
+        }
+
+        then {
+            assertAll(
+            { assert process.success },
+            { assert snapshot(
+                    process.out.report,
+                    process.out.classified_reads_assignment,
+                    process.out.versions,
+                ).match()
+            },
+            )
+        }
+    }
+}