From 8d32f1f0652eafe2107776417277b904d6b5c01e Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 14 Aug 2024 11:18:16 +0100 Subject: [PATCH 1/4] nf-core modules install seqkit/fq2fa --- modules.json | 5 ++ modules/nf-core/seqkit/fq2fa/environment.yml | 7 ++ modules/nf-core/seqkit/fq2fa/main.nf | 48 +++++++++++++ modules/nf-core/seqkit/fq2fa/meta.yml | 44 ++++++++++++ .../nf-core/seqkit/fq2fa/tests/main.nf.test | 56 +++++++++++++++ .../seqkit/fq2fa/tests/main.nf.test.snap | 72 +++++++++++++++++++ modules/nf-core/seqkit/fq2fa/tests/tags.yml | 2 + 7 files changed, 234 insertions(+) create mode 100644 modules/nf-core/seqkit/fq2fa/environment.yml create mode 100644 modules/nf-core/seqkit/fq2fa/main.nf create mode 100644 modules/nf-core/seqkit/fq2fa/meta.yml create mode 100644 modules/nf-core/seqkit/fq2fa/tests/main.nf.test create mode 100644 modules/nf-core/seqkit/fq2fa/tests/main.nf.test.snap create mode 100644 modules/nf-core/seqkit/fq2fa/tests/tags.yml diff --git a/modules.json b/modules.json index 113030b..60d80ef 100644 --- a/modules.json +++ b/modules.json @@ -86,6 +86,11 @@ "git_sha": "6c2309aaec566c0d44a6cf14d4b2d0c51afe2e91", "installed_by": ["modules"] }, + "seqkit/fq2fa": { + "branch": "master", + "git_sha": "03fbf6c89e551bd8d77f3b751fb5c955f75b34c5", + "installed_by": ["modules"] + }, "untar": { "branch": "master", "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d", diff --git a/modules/nf-core/seqkit/fq2fa/environment.yml b/modules/nf-core/seqkit/fq2fa/environment.yml new file mode 100644 index 0000000..aede676 --- /dev/null +++ b/modules/nf-core/seqkit/fq2fa/environment.yml @@ -0,0 +1,7 @@ +name: seqkit_fq2fa +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::seqkit=2.8.1 diff --git a/modules/nf-core/seqkit/fq2fa/main.nf b/modules/nf-core/seqkit/fq2fa/main.nf new file mode 100644 index 0000000..77462ad --- /dev/null +++ b/modules/nf-core/seqkit/fq2fa/main.nf @@ -0,0 +1,48 @@ +process SEQKIT_FQ2FA { + tag "$meta.id" + 
label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqkit:2.8.1--h9ee0642_0' : + 'biocontainers/seqkit:2.8.1--h9ee0642_0' }" + + input: + tuple val(meta), path(fastq) + + output: + tuple val(meta), path("*.fa.gz"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + seqkit \\ + fq2fa \\ + $args \\ + -j $task.cpus \\ + -o ${prefix}.fa.gz \\ + $fastq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$( seqkit | sed '3!d; s/Version: //' ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo "" | gzip > ${prefix}.fa.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$( seqkit | sed '3!d; s/Version: //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/seqkit/fq2fa/meta.yml b/modules/nf-core/seqkit/fq2fa/meta.yml new file mode 100644 index 0000000..d0c55b3 --- /dev/null +++ b/modules/nf-core/seqkit/fq2fa/meta.yml @@ -0,0 +1,44 @@ +name: "seqkit_fq2fa" +description: Convert FASTQ to FASTA format +keywords: + - fastq + - fasta + - convert +tools: + - "seqkit": + description: "Cross-platform and ultrafast toolkit for FASTA/Q file manipulation, written by Wei Shen." + homepage: "https://github.com/shenwei356/seqkit" + documentation: "https://bioinf.shenwei.me/seqkit/" + doi: "10.1371/journal.pone.0163962" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + + - fastq: + type: file + description: Sequence file in fastq format + pattern: "*.{fastq,fq}.gz" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fasta: + type: file + description: Sequence file in fasta format + pattern: "*.{fasta,fa}.gz" + +authors: + - "@d-jch" diff --git a/modules/nf-core/seqkit/fq2fa/tests/main.nf.test b/modules/nf-core/seqkit/fq2fa/tests/main.nf.test new file mode 100644 index 0000000..08f399e --- /dev/null +++ b/modules/nf-core/seqkit/fq2fa/tests/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + + name "Test Process SEQKIT_FQ2FA" + script "../main.nf" + process "SEQKIT_FQ2FA" + + tag "modules" + tag "modules_nfcore" + tag "seqkit" + tag "seqkit/fq2fa" + + test("sarscov2 - bam") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/seqkit/fq2fa/tests/main.nf.test.snap b/modules/nf-core/seqkit/fq2fa/tests/main.nf.test.snap new file mode 100644 index 0000000..b10ff75 --- /dev/null +++ b/modules/nf-core/seqkit/fq2fa/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,70efc6839fd6443ee9116c082a730f72" + ], + "fasta": [ + [ + { + "id": "test", + "single_end": false 
+ }, + "test.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,70efc6839fd6443ee9116c082a730f72" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-13T08:56:21.234724552" + }, + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fa.gz:md5,f0c5c9110ce19e9ebbc9a6b6baf9e105" + ] + ], + "1": [ + "versions.yml:md5,70efc6839fd6443ee9116c082a730f72" + ], + "fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fa.gz:md5,f0c5c9110ce19e9ebbc9a6b6baf9e105" + ] + ], + "versions": [ + "versions.yml:md5,70efc6839fd6443ee9116c082a730f72" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-13T08:55:54.648865102" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqkit/fq2fa/tests/tags.yml b/modules/nf-core/seqkit/fq2fa/tests/tags.yml new file mode 100644 index 0000000..004f102 --- /dev/null +++ b/modules/nf-core/seqkit/fq2fa/tests/tags.yml @@ -0,0 +1,2 @@ +seqkit/fq2fa: + - "modules/nf-core/seqkit/fq2fa/**" From 33b0247d3ec475298bcc01d60937aa881ecbc50e Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 14 Aug 2024 11:30:28 +0100 Subject: [PATCH 2/4] nf-core modules install seqtk/subseq --- modules.json | 5 ++ modules/nf-core/seqtk/subseq/environment.yml | 7 +++ modules/nf-core/seqtk/subseq/main.nf | 56 +++++++++++++++++ modules/nf-core/seqtk/subseq/meta.yml | 40 +++++++++++++ .../nf-core/seqtk/subseq/tests/main.nf.test | 59 ++++++++++++++++++ .../seqtk/subseq/tests/main.nf.test.snap | 60 +++++++++++++++++++ .../seqtk/subseq/tests/standard.config | 5 ++ modules/nf-core/seqtk/subseq/tests/tags.yml | 2 + 8 files changed, 234 insertions(+) create mode 100644 modules/nf-core/seqtk/subseq/environment.yml create mode 100644 modules/nf-core/seqtk/subseq/main.nf create mode 100644 modules/nf-core/seqtk/subseq/meta.yml create mode 100644 
modules/nf-core/seqtk/subseq/tests/main.nf.test create mode 100644 modules/nf-core/seqtk/subseq/tests/main.nf.test.snap create mode 100644 modules/nf-core/seqtk/subseq/tests/standard.config create mode 100644 modules/nf-core/seqtk/subseq/tests/tags.yml diff --git a/modules.json b/modules.json index 60d80ef..0653c8f 100644 --- a/modules.json +++ b/modules.json @@ -91,6 +91,11 @@ "git_sha": "03fbf6c89e551bd8d77f3b751fb5c955f75b34c5", "installed_by": ["modules"] }, + "seqtk/subseq": { + "branch": "master", + "git_sha": "730f3aee80d5f8d0b5fc532202ac59361414d006", + "installed_by": ["modules"] + }, "untar": { "branch": "master", "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d", diff --git a/modules/nf-core/seqtk/subseq/environment.yml b/modules/nf-core/seqtk/subseq/environment.yml new file mode 100644 index 0000000..7abe364 --- /dev/null +++ b/modules/nf-core/seqtk/subseq/environment.yml @@ -0,0 +1,7 @@ +name: seqtk_subseq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::seqtk=1.4 diff --git a/modules/nf-core/seqtk/subseq/main.nf b/modules/nf-core/seqtk/subseq/main.nf new file mode 100644 index 0000000..d5caebc --- /dev/null +++ b/modules/nf-core/seqtk/subseq/main.nf @@ -0,0 +1,56 @@ +process SEQTK_SUBSEQ { + tag "$sequences" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/seqtk:1.4--he4a0461_1' : + 'biocontainers/seqtk:1.4--he4a0461_1' }" + + input: + tuple val(meta), path(sequences) + path filter_list + + output: + tuple val(meta), path("*.gz"), emit: sequences + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def ext = "fa" + if ("$sequences" ==~ /.+\.fq|.+\.fq.gz|.+\.fastq|.+\.fastq.gz/) { + ext = "fq" + } + """ + seqtk \\ + subseq \\ + $args \\ + $sequences \\ + $filter_list | \\ + gzip --no-name > ${sequences}${prefix}.${ext}.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def ext = "fa" + if ("$sequences" ==~ /.+\.fq|.+\.fq.gz|.+\.fastq|.+\.fastq.gz/) { + ext = "fq" + } + """ + echo "" | gzip > ${sequences}${prefix}.${ext}.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/seqtk/subseq/meta.yml b/modules/nf-core/seqtk/subseq/meta.yml new file mode 100644 index 0000000..de4a841 --- /dev/null +++ b/modules/nf-core/seqtk/subseq/meta.yml @@ -0,0 +1,40 @@ +name: seqtk_subseq +description: Select only sequences that match the filtering condition +keywords: + - filtering + - selection + - fastx +tools: + - seqtk: + description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format + homepage: https://github.com/lh3/seqtk + documentation: https://docs.csc.fi/apps/seqtk/ + tool_dev_url: https://github.com/lh3/seqtk + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - sequences: + type: file + description: FASTQ/FASTA file + pattern: "*.{fq,fq.gz,fa,fa.gz}" + - filter_list: + type: file + description: BED file or a text file with a list of sequence names + pattern: "*.{bed,lst}" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - sequences: + type: file + description: FASTQ/FASTA file + pattern: "*.{fq.gz,fa.gz}" +authors: + - "@sidorov-si" +maintainers: + - "@sidorov-si" diff --git a/modules/nf-core/seqtk/subseq/tests/main.nf.test b/modules/nf-core/seqtk/subseq/tests/main.nf.test new file mode 100644 index 0000000..fa8fad6 --- /dev/null +++ b/modules/nf-core/seqtk/subseq/tests/main.nf.test @@ -0,0 +1,59 @@ +nextflow_process { + + name "Test Process SEQTK_SUBSEQ" + script "modules/nf-core/seqtk/subseq/main.nf" + process "SEQTK_SUBSEQ" + config "./standard.config" + + tag "modules" + tag "modules_nfcore" + tag "seqtk" + tag "seqtk/subseq" + + test("sarscov2_subseq_fa") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed.gz', checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2_subseq_fa_stub") { + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed.gz', checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/seqtk/subseq/tests/main.nf.test.snap 
b/modules/nf-core/seqtk/subseq/tests/main.nf.test.snap new file mode 100644 index 0000000..75b3793 --- /dev/null +++ b/modules/nf-core/seqtk/subseq/tests/main.nf.test.snap @@ -0,0 +1,60 @@ +{ + "sarscov2_subseq_fa": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "genome.fasta.filtered.fa.gz:md5,31c95c4d686526cf002f6119bc55b2b2" + ] + ], + "1": [ + "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2" + ], + "sequences": [ + [ + { + "id": "test" + }, + "genome.fasta.filtered.fa.gz:md5,31c95c4d686526cf002f6119bc55b2b2" + ] + ], + "versions": [ + "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2" + ] + } + ], + "timestamp": "2024-02-22T15:56:36.155954" + }, + "sarscov2_subseq_fa_stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "genome.fasta.filtered.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2" + ], + "sequences": [ + [ + { + "id": "test" + }, + "genome.fasta.filtered.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2" + ] + } + ], + "timestamp": "2024-02-22T15:56:44.222329" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqtk/subseq/tests/standard.config b/modules/nf-core/seqtk/subseq/tests/standard.config new file mode 100644 index 0000000..e8d7dc3 --- /dev/null +++ b/modules/nf-core/seqtk/subseq/tests/standard.config @@ -0,0 +1,5 @@ +process { + withName: SEQTK_SUBSEQ { + ext.prefix = { ".filtered" } + } +} \ No newline at end of file diff --git a/modules/nf-core/seqtk/subseq/tests/tags.yml b/modules/nf-core/seqtk/subseq/tests/tags.yml new file mode 100644 index 0000000..74056ba --- /dev/null +++ b/modules/nf-core/seqtk/subseq/tests/tags.yml @@ -0,0 +1,2 @@ +seqtk/subseq: + - "modules/nf-core/seqtk/subseq/**" From dda1b1c4264229bc183be0baf5b18c19d5292501 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 14 Aug 2024 14:34:31 +0100 Subject: [PATCH 3/4] added fastq option for 
pacbio input --- assets/samplesheet_s3.csv | 2 +- docs/usage.md | 2 +- subworkflows/local/filter_pacbio.nf | 66 +++++++++++++++++++++++------ 3 files changed, 54 insertions(+), 16 deletions(-) diff --git a/assets/samplesheet_s3.csv b/assets/samplesheet_s3.csv index b409060..ff9641b 100644 --- a/assets/samplesheet_s3.csv +++ b/assets/samplesheet_s3.csv @@ -4,4 +4,4 @@ mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_d mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/hic/35528_2%231.subset.cram, mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/ont/PAE35587_pass_1f1f0707_115.subset.fastq.gz, mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200910_173211.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam, -mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200911_174739.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam, +mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200911_174739.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.fastq.gz, diff --git a/docs/usage.md b/docs/usage.md index 9a02085..2e707a9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -42,7 +42,7 @@ sample1_T5,pacbio,pacbio2.bam,pacbio2 | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_). | | `datatype` | Type of sequencing data. Must be one of `hic`, `Illumina`, `pacbio`, or `ont`. | -| `datafile` | Full path to read data file. 
Must be `bam` or `cram` or `fastq.gz` or `fq.gz` for `Illumina` and `HiC`. Must be `bam` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`. | +| `datafile` | Full path to read data file. Must be `bam`, `cram`, `fastq.gz` or `fq.gz` for `Illumina` and `HiC`. Must be `bam`, `fastq.gz` or `fq.gz` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`. | | `library` | (Optional) The library value is a unique identifier which is assigned to read group (`@RG`) ID. If the library name is not specified, the pipeline will auto-create library name using the data filename provided in the samplesheet. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf index 759880c..acb21fa 100644 --- a/subworkflows/local/filter_pacbio.nf +++ b/subworkflows/local/filter_pacbio.nf @@ -8,6 +8,8 @@ include { SAMTOOLS_COLLATETOFASTA } from '../../modules/local/samtools include { BLAST_BLASTN } from '../../modules/nf-core/blast/blastn/main' include { PACBIO_FILTER } from '../../modules/local/pacbio_filter' include { SAMTOOLS_FILTERTOFASTQ } from '../../modules/local/samtools_filtertofastq' +include { SEQKIT_FQ2FA } from '../../modules/nf-core/seqkit/fq2fa' +include { SEQTK_SUBSEQ } from '../../modules/nf-core/seqtk/subseq' workflow FILTER_PACBIO { @@ -20,8 +22,18 @@ workflow FILTER_PACBIO { ch_versions = Channel.empty() - // Convert from PacBio BAM to Samtools BAM + // Check file types and branch reads + | branch { + meta, reads -> + fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ } + bam : true + } + | set { ch_reads } + + + // Convert from PacBio BAM to Samtools BAM + ch_reads.bam | map { meta, bam -> [ meta, bam, [] ] } | set { ch_pacbio } @@ -34,8 +46,19 @@ workflow FILTER_PACBIO { ch_versions = ch_versions.mix ( SAMTOOLS_COLLATETOFASTA.out.versions.first() ) + // Convert FASTQ to FASTA using SEQKIT_FQ2FA + SEQKIT_FQ2FA ( ch_reads.fastq ) + 
ch_versions = ch_versions.mix ( SEQKIT_FQ2FA.out.versions.first() ) + + + // Combine BAM-derived FASTA with converted FASTQ inputs + SAMTOOLS_COLLATETOFASTA.out.fasta + | concat( SEQKIT_FQ2FA.out.fasta ) + | set { ch_fasta } + + // Nucleotide BLAST - BLAST_BLASTN ( SAMTOOLS_COLLATETOFASTA.out.fasta, db ) + BLAST_BLASTN ( ch_fasta, db ) ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() ) @@ -44,25 +67,40 @@ workflow FILTER_PACBIO { ch_versions = ch_versions.mix ( PACBIO_FILTER.out.versions.first() ) - // Filter the BAM file and convert to FASTQ + // Filter the BAM files and convert to FASTQ SAMTOOLS_CONVERT.out.bam | join ( SAMTOOLS_CONVERT.out.csi ) | join ( PACBIO_FILTER.out.list ) - | set { ch_reads_and_list } + | multiMap { meta, bam, csi, list -> \ + bams: [meta, bam, csi] + lists: list + } + | set { ch_bam_reads } - ch_reads_and_list - | map { meta, bam, csi, list -> [meta, bam, csi] } - | set { ch_reads } + SAMTOOLS_FILTERTOFASTQ ( ch_bam_reads.bams, ch_bam_reads.lists ) + ch_versions = ch_versions.mix ( SAMTOOLS_FILTERTOFASTQ.out.versions.first() ) - ch_reads_and_list - | map { meta, bam, csi, list -> list } - | set { ch_lists } - SAMTOOLS_FILTERTOFASTQ ( ch_reads, ch_lists ) - ch_versions = ch_versions.mix ( SAMTOOLS_FILTERTOFASTQ.out.versions.first() ) + // Filter inputs provided as FASTQ + ch_reads.fastq + | join(PACBIO_FILTER.out.list) + | multiMap { meta, fastq, list -> \ + fastqs: [meta, fastq] + lists: list + } + | set { ch_reads_fastq } + + SEQTK_SUBSEQ ( ch_reads_fastq.fastqs, ch_reads_fastq.lists ) + ch_versions = ch_versions.mix ( SEQTK_SUBSEQ.out.versions.first() ) + + + // Merge filtered outputs as ch_filtered_fastq + SEQTK_SUBSEQ.out.sequences + | concat ( SAMTOOLS_FILTERTOFASTQ.out.fastq ) + | set { ch_filtered_fastq } emit: - fastq = SAMTOOLS_FILTERTOFASTQ.out.fastq // channel: [ meta, /path/to/fastq ] - versions = ch_versions // channel: [ versions.yml ] + fastq = ch_filtered_fastq // channel: [ meta, /path/to/fastq ] + 
versions = ch_versions // channel: [ versions.yml ] } From e2a8d462e54e29972c2136509e4161d22b4086a3 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 14 Aug 2024 14:39:43 +0100 Subject: [PATCH 4/4] prettier linting --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 2e707a9..9423325 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -42,7 +42,7 @@ sample1_T5,pacbio,pacbio2.bam,pacbio2 | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_). | | `datatype` | Type of sequencing data. Must be one of `hic`, `Illumina`, `pacbio`, or `ont`. | -| `datafile` | Full path to read data file. Must be `bam`, `cram`, `fastq.gz` or `fq.gz` for `Illumina` and `HiC`. Must be `bam`, `fastq.gz` or `fq.gz` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`. | +| `datafile` | Full path to read data file. Must be `bam`, `cram`, `fastq.gz` or `fq.gz` for `Illumina` and `HiC`. Must be `bam`, `fastq.gz` or `fq.gz` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`. | | `library` | (Optional) The library value is a unique identifier which is assigned to read group (`@RG`) ID. If the library name is not specified, the pipeline will auto-create library name using the data filename provided in the samplesheet. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.