From 8d32f1f0652eafe2107776417277b904d6b5c01e Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Wed, 14 Aug 2024 11:18:16 +0100
Subject: [PATCH 1/4] nf-core modules install seqkit/fq2fa

---
 modules.json                                  |  5 ++
 modules/nf-core/seqkit/fq2fa/environment.yml  |  7 ++
 modules/nf-core/seqkit/fq2fa/main.nf          | 48 +++++++++++++
 modules/nf-core/seqkit/fq2fa/meta.yml         | 44 ++++++++++++
 .../nf-core/seqkit/fq2fa/tests/main.nf.test   | 56 +++++++++++++++
 .../seqkit/fq2fa/tests/main.nf.test.snap      | 72 +++++++++++++++++++
 modules/nf-core/seqkit/fq2fa/tests/tags.yml   |  2 +
 7 files changed, 234 insertions(+)
 create mode 100644 modules/nf-core/seqkit/fq2fa/environment.yml
 create mode 100644 modules/nf-core/seqkit/fq2fa/main.nf
 create mode 100644 modules/nf-core/seqkit/fq2fa/meta.yml
 create mode 100644 modules/nf-core/seqkit/fq2fa/tests/main.nf.test
 create mode 100644 modules/nf-core/seqkit/fq2fa/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/seqkit/fq2fa/tests/tags.yml

diff --git a/modules.json b/modules.json
index 113030b..60d80ef 100644
--- a/modules.json
+++ b/modules.json
@@ -86,6 +86,11 @@
                         "git_sha": "6c2309aaec566c0d44a6cf14d4b2d0c51afe2e91",
                         "installed_by": ["modules"]
                     },
+                    "seqkit/fq2fa": {
+                        "branch": "master",
+                        "git_sha": "03fbf6c89e551bd8d77f3b751fb5c955f75b34c5",
+                        "installed_by": ["modules"]
+                    },
                     "untar": {
                         "branch": "master",
                         "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d",
diff --git a/modules/nf-core/seqkit/fq2fa/environment.yml b/modules/nf-core/seqkit/fq2fa/environment.yml
new file mode 100644
index 0000000..aede676
--- /dev/null
+++ b/modules/nf-core/seqkit/fq2fa/environment.yml
@@ -0,0 +1,7 @@
+name: seqkit_fq2fa
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::seqkit=2.8.1
diff --git a/modules/nf-core/seqkit/fq2fa/main.nf b/modules/nf-core/seqkit/fq2fa/main.nf
new file mode 100644
index 0000000..77462ad
--- /dev/null
+++ b/modules/nf-core/seqkit/fq2fa/main.nf
@@ -0,0 +1,48 @@
+process SEQKIT_FQ2FA {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/seqkit:2.8.1--h9ee0642_0' :
+        'biocontainers/seqkit:2.8.1--h9ee0642_0' }"
+
+    input:
+    tuple val(meta), path(fastq)
+
+    output:
+    tuple val(meta), path("*.fa.gz"), emit: fasta
+    path "versions.yml"             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    seqkit \\
+        fq2fa \\
+        $args \\
+        -j $task.cpus \\
+        -o ${prefix}.fa.gz \\
+        $fastq
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        seqkit: \$( seqkit | sed '3!d; s/Version: //' )
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    echo "" | gzip > ${prefix}.fa.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        seqkit: \$( seqkit | sed '3!d; s/Version: //' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/seqkit/fq2fa/meta.yml b/modules/nf-core/seqkit/fq2fa/meta.yml
new file mode 100644
index 0000000..d0c55b3
--- /dev/null
+++ b/modules/nf-core/seqkit/fq2fa/meta.yml
@@ -0,0 +1,44 @@
+name: "seqkit_fq2fa"
+description: Convert FASTQ to FASTA format
+keywords:
+  - fastq
+  - fasta
+  - convert
+tools:
+  - "seqkit":
+      description: "Cross-platform and ultrafast toolkit for FASTA/Q file manipulation, written by Wei Shen."
+      homepage: "https://github.com/shenwei356/seqkit"
+      documentation: "https://bioinf.shenwei.me/seqkit/"
+      doi: "10.1371/journal.pone.0163962"
+      licence: ["MIT"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test', single_end:false ]`
+
+  - fastq:
+      type: file
+      description: Sequence file in fastq format
+      pattern: "*.{fastq,fq}.gz"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test', single_end:false ]`
+
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - fasta:
+      type: file
+      description: Sequence file in fasta format
+      pattern: "*.{fasta,fa}.gz"
+
+authors:
+  - "@d-jch"
diff --git a/modules/nf-core/seqkit/fq2fa/tests/main.nf.test b/modules/nf-core/seqkit/fq2fa/tests/main.nf.test
new file mode 100644
index 0000000..08f399e
--- /dev/null
+++ b/modules/nf-core/seqkit/fq2fa/tests/main.nf.test
@@ -0,0 +1,56 @@
+nextflow_process {
+
+    name "Test Process SEQKIT_FQ2FA"
+    script "../main.nf"
+    process "SEQKIT_FQ2FA"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "seqkit"
+    tag "seqkit/fq2fa"
+
+    test("sarscov2 - bam") {
+
+        when {
+            process {
+                """
+                input[0] = [[ id:'test', single_end:false ], // meta map
+                            [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ]
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+    test("sarscov2 - bam - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = [[ id:'test', single_end:false ], // meta map
+                            [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ]
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+}
diff --git a/modules/nf-core/seqkit/fq2fa/tests/main.nf.test.snap b/modules/nf-core/seqkit/fq2fa/tests/main.nf.test.snap
new file mode 100644
index 0000000..b10ff75
--- /dev/null
+++ b/modules/nf-core/seqkit/fq2fa/tests/main.nf.test.snap
@@ -0,0 +1,72 @@
+{
+    "sarscov2 - bam - stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,70efc6839fd6443ee9116c082a730f72"
+                ],
+                "fasta": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,70efc6839fd6443ee9116c082a730f72"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-05-13T08:56:21.234724552"
+    },
+    "sarscov2 - bam": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.fa.gz:md5,f0c5c9110ce19e9ebbc9a6b6baf9e105"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,70efc6839fd6443ee9116c082a730f72"
+                ],
+                "fasta": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.fa.gz:md5,f0c5c9110ce19e9ebbc9a6b6baf9e105"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,70efc6839fd6443ee9116c082a730f72"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-05-13T08:55:54.648865102"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/seqkit/fq2fa/tests/tags.yml b/modules/nf-core/seqkit/fq2fa/tests/tags.yml
new file mode 100644
index 0000000..004f102
--- /dev/null
+++ b/modules/nf-core/seqkit/fq2fa/tests/tags.yml
@@ -0,0 +1,2 @@
+seqkit/fq2fa:
+  - "modules/nf-core/seqkit/fq2fa/**"

From 33b0247d3ec475298bcc01d60937aa881ecbc50e Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Wed, 14 Aug 2024 11:30:28 +0100
Subject: [PATCH 2/4] nf-core modules install seqtk/subseq

---
 modules.json                                  |  5 ++
 modules/nf-core/seqtk/subseq/environment.yml  |  7 +++
 modules/nf-core/seqtk/subseq/main.nf          | 56 +++++++++++++++++
 modules/nf-core/seqtk/subseq/meta.yml         | 40 +++++++++++++
 .../nf-core/seqtk/subseq/tests/main.nf.test   | 59 ++++++++++++++++++
 .../seqtk/subseq/tests/main.nf.test.snap      | 60 +++++++++++++++++++
 .../seqtk/subseq/tests/standard.config        |  5 ++
 modules/nf-core/seqtk/subseq/tests/tags.yml   |  2 +
 8 files changed, 234 insertions(+)
 create mode 100644 modules/nf-core/seqtk/subseq/environment.yml
 create mode 100644 modules/nf-core/seqtk/subseq/main.nf
 create mode 100644 modules/nf-core/seqtk/subseq/meta.yml
 create mode 100644 modules/nf-core/seqtk/subseq/tests/main.nf.test
 create mode 100644 modules/nf-core/seqtk/subseq/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/seqtk/subseq/tests/standard.config
 create mode 100644 modules/nf-core/seqtk/subseq/tests/tags.yml

diff --git a/modules.json b/modules.json
index 60d80ef..0653c8f 100644
--- a/modules.json
+++ b/modules.json
@@ -91,6 +91,11 @@
                         "git_sha": "03fbf6c89e551bd8d77f3b751fb5c955f75b34c5",
                         "installed_by": ["modules"]
                     },
+                    "seqtk/subseq": {
+                        "branch": "master",
+                        "git_sha": "730f3aee80d5f8d0b5fc532202ac59361414d006",
+                        "installed_by": ["modules"]
+                    },
                     "untar": {
                         "branch": "master",
                         "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d",
diff --git a/modules/nf-core/seqtk/subseq/environment.yml b/modules/nf-core/seqtk/subseq/environment.yml
new file mode 100644
index 0000000..7abe364
--- /dev/null
+++ b/modules/nf-core/seqtk/subseq/environment.yml
@@ -0,0 +1,7 @@
+name: seqtk_subseq
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::seqtk=1.4
diff --git a/modules/nf-core/seqtk/subseq/main.nf b/modules/nf-core/seqtk/subseq/main.nf
new file mode 100644
index 0000000..d5caebc
--- /dev/null
+++ b/modules/nf-core/seqtk/subseq/main.nf
@@ -0,0 +1,56 @@
+process SEQTK_SUBSEQ {
+    tag "$sequences"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/seqtk:1.4--he4a0461_1' :
+        'biocontainers/seqtk:1.4--he4a0461_1' }"
+
+    input:
+    tuple val(meta), path(sequences)
+    path filter_list
+
+    output:
+    tuple val(meta), path("*.gz"),  emit: sequences
+    path "versions.yml",            emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args   = task.ext.args   ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def ext = "fa"
+    if ("$sequences" ==~ /.+\.fq|.+\.fq.gz|.+\.fastq|.+\.fastq.gz/) {
+        ext = "fq"
+    }
+    """
+    seqtk \\
+        subseq \\
+        $args \\
+        $sequences \\
+        $filter_list | \\
+        gzip --no-name > ${sequences}${prefix}.${ext}.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def ext = "fa"
+    if ("$sequences" ==~ /.+\.fq|.+\.fq.gz|.+\.fastq|.+\.fastq.gz/) {
+        ext = "fq"
+    }
+    """
+    echo "" | gzip > ${sequences}${prefix}.${ext}.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/seqtk/subseq/meta.yml b/modules/nf-core/seqtk/subseq/meta.yml
new file mode 100644
index 0000000..de4a841
--- /dev/null
+++ b/modules/nf-core/seqtk/subseq/meta.yml
@@ -0,0 +1,40 @@
+name: seqtk_subseq
+description: Select only sequences that match the filtering condition
+keywords:
+  - filtering
+  - selection
+  - fastx
+tools:
+  - seqtk:
+      description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format
+      homepage: https://github.com/lh3/seqtk
+      documentation: https://docs.csc.fi/apps/seqtk/
+      tool_dev_url: https://github.com/lh3/seqtk
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test' ]
+  - sequences:
+      type: file
+      description: FASTQ/FASTA file
+      pattern: "*.{fq,fq.gz,fa,fa.gz}"
+  - filter_list:
+      type: file
+      description: BED file or a text file with a list of sequence names
+      pattern: "*.{bed,lst}"
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - sequences:
+      type: file
+      description: FASTQ/FASTA file
+      pattern: "*.{fq.gz,fa.gz}"
+authors:
+  - "@sidorov-si"
+maintainers:
+  - "@sidorov-si"
diff --git a/modules/nf-core/seqtk/subseq/tests/main.nf.test b/modules/nf-core/seqtk/subseq/tests/main.nf.test
new file mode 100644
index 0000000..fa8fad6
--- /dev/null
+++ b/modules/nf-core/seqtk/subseq/tests/main.nf.test
@@ -0,0 +1,59 @@
+nextflow_process {
+
+    name "Test Process SEQTK_SUBSEQ"
+    script "modules/nf-core/seqtk/subseq/main.nf"
+    process "SEQTK_SUBSEQ"
+    config "./standard.config"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "seqtk"
+    tag "seqtk/subseq"
+
+    test("sarscov2_subseq_fa") {
+
+        when {
+            process {
+                """
+                input[0] =  [
+                                [ id:'test' ],
+                                file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+                            ]
+                input[1] =  file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed.gz', checkIfExists: true)
+                """
+            }
+        }
+
+        then {
+            assertAll (
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+    test("sarscov2_subseq_fa_stub") {
+        options "-stub"
+        when {
+            process {
+                """
+                input[0] =  [
+                                [ id:'test' ],
+                                file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+                            ]
+                input[1] =  file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed.gz', checkIfExists: true)
+                """
+            }
+        }
+
+        then {
+            assertAll (
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+}
diff --git a/modules/nf-core/seqtk/subseq/tests/main.nf.test.snap b/modules/nf-core/seqtk/subseq/tests/main.nf.test.snap
new file mode 100644
index 0000000..75b3793
--- /dev/null
+++ b/modules/nf-core/seqtk/subseq/tests/main.nf.test.snap
@@ -0,0 +1,60 @@
+{
+    "sarscov2_subseq_fa": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "genome.fasta.filtered.fa.gz:md5,31c95c4d686526cf002f6119bc55b2b2"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2"
+                ],
+                "sequences": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "genome.fasta.filtered.fa.gz:md5,31c95c4d686526cf002f6119bc55b2b2"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2"
+                ]
+            }
+        ],
+        "timestamp": "2024-02-22T15:56:36.155954"
+    },
+    "sarscov2_subseq_fa_stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "genome.fasta.filtered.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2"
+                ],
+                "sequences": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "genome.fasta.filtered.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2"
+                ]
+            }
+        ],
+        "timestamp": "2024-02-22T15:56:44.222329"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/seqtk/subseq/tests/standard.config b/modules/nf-core/seqtk/subseq/tests/standard.config
new file mode 100644
index 0000000..e8d7dc3
--- /dev/null
+++ b/modules/nf-core/seqtk/subseq/tests/standard.config
@@ -0,0 +1,5 @@
+process {
+    withName: SEQTK_SUBSEQ {
+        ext.prefix = { ".filtered" }
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/seqtk/subseq/tests/tags.yml b/modules/nf-core/seqtk/subseq/tests/tags.yml
new file mode 100644
index 0000000..74056ba
--- /dev/null
+++ b/modules/nf-core/seqtk/subseq/tests/tags.yml
@@ -0,0 +1,2 @@
+seqtk/subseq:
+  - "modules/nf-core/seqtk/subseq/**"

From dda1b1c4264229bc183be0baf5b18c19d5292501 Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Wed, 14 Aug 2024 14:34:31 +0100
Subject: [PATCH 3/4] added fastq option for pacbio input

---
 assets/samplesheet_s3.csv           |  2 +-
 docs/usage.md                       |  2 +-
 subworkflows/local/filter_pacbio.nf | 66 +++++++++++++++++++++++------
 3 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/assets/samplesheet_s3.csv b/assets/samplesheet_s3.csv
index b409060..ff9641b 100644
--- a/assets/samplesheet_s3.csv
+++ b/assets/samplesheet_s3.csv
@@ -4,4 +4,4 @@ mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_d
 mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/hic/35528_2%231.subset.cram,
 mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/ont/PAE35587_pass_1f1f0707_115.subset.fastq.gz,
 mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200910_173211.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam,
-mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200911_174739.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam,
+mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200911_174739.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.fastq.gz,
diff --git a/docs/usage.md b/docs/usage.md
index 9a02085..2e707a9 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -42,7 +42,7 @@ sample1_T5,pacbio,pacbio2.bam,pacbio2
 | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `sample`   | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_).                                                 |
 | `datatype` | Type of sequencing data. Must be one of `hic`, `Illumina`, `pacbio`, or `ont`.                                                                                                                                                        |
-| `datafile` | Full path to read data file. Must be `bam` or `cram` or `fastq.gz` or `fq.gz` for `Illumina` and `HiC`. Must be `bam` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`.                                                          |
+| `datafile` | Full path to read data file. Must be `bam`, `cram`, `fastq.gz` or `fq.gz` for `Illumina` and `HiC`. Must be `bam`, `fastq.gz` or `fq.gz` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`.                          |
 | `library`  | (Optional) The library value is a unique identifier which is assigned to read group (`@RG`) ID. If the library name is not specified, the pipeline will auto-create library name using the data filename provided in the samplesheet. |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf
index 759880c..acb21fa 100644
--- a/subworkflows/local/filter_pacbio.nf
+++ b/subworkflows/local/filter_pacbio.nf
@@ -8,6 +8,8 @@ include { SAMTOOLS_COLLATETOFASTA           } from '../../modules/local/samtools
 include { BLAST_BLASTN                      } from '../../modules/nf-core/blast/blastn/main'
 include { PACBIO_FILTER                     } from '../../modules/local/pacbio_filter'
 include { SAMTOOLS_FILTERTOFASTQ            } from '../../modules/local/samtools_filtertofastq'
+include { SEQKIT_FQ2FA                      } from '../../modules/nf-core/seqkit/fq2fa'
+include { SEQTK_SUBSEQ                      } from '../../modules/nf-core/seqtk/subseq'
 
 
 workflow FILTER_PACBIO {
@@ -20,8 +22,18 @@ workflow FILTER_PACBIO {
     ch_versions = Channel.empty()
 
 
-    // Convert from PacBio BAM to Samtools BAM
+    // Branch on file name: only *.fastq.gz / *.fq.gz (case-insensitive, whole-name match) are FASTQ
     reads
+    | branch {
+        meta, reads ->
+            fastq : [reads].flatten().any { it.getName().toLowerCase() ==~ /.+\.(fastq|fq)\.gz/ }
+            bam : true  // fallback: anything not recognised as FASTQ takes the BAM route
+    }
+    | set { ch_reads }
+
+
+    // Convert from PacBio BAM to Samtools BAM
+    ch_reads.bam
     | map { meta, bam -> [ meta, bam, [] ] }
     | set { ch_pacbio }
 
@@ -34,8 +46,19 @@ workflow FILTER_PACBIO {
     ch_versions = ch_versions.mix ( SAMTOOLS_COLLATETOFASTA.out.versions.first() )
 
 
+    // Convert FASTQ to FASTA using SEQKIT_FQ2FA
+    SEQKIT_FQ2FA ( ch_reads.fastq )
+    ch_versions = ch_versions.mix ( SEQKIT_FQ2FA.out.versions.first() )
+
+
+    // Combine BAM-derived FASTA with converted FASTQ inputs
+    SAMTOOLS_COLLATETOFASTA.out.fasta
+    | concat( SEQKIT_FQ2FA.out.fasta )
+    | set { ch_fasta }
+
+
     // Nucleotide BLAST
-    BLAST_BLASTN ( SAMTOOLS_COLLATETOFASTA.out.fasta, db )
+    BLAST_BLASTN ( ch_fasta, db )
     ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() )
 
 
@@ -44,25 +67,40 @@ workflow FILTER_PACBIO {
     ch_versions = ch_versions.mix ( PACBIO_FILTER.out.versions.first() )
 
 
-    // Filter the BAM file and convert to FASTQ
+    // Filter the BAM files and convert to FASTQ
     SAMTOOLS_CONVERT.out.bam
     | join ( SAMTOOLS_CONVERT.out.csi )
     | join ( PACBIO_FILTER.out.list )
-    | set { ch_reads_and_list }
+    | multiMap { meta, bam, csi, list -> \
+            bams: [meta, bam, csi]
+            lists: list
+    }
+    | set { ch_bam_reads }
 
-    ch_reads_and_list
-    | map { meta, bam, csi, list -> [meta, bam, csi] }
-    | set { ch_reads }
+    SAMTOOLS_FILTERTOFASTQ ( ch_bam_reads.bams, ch_bam_reads.lists )
+    ch_versions = ch_versions.mix ( SAMTOOLS_FILTERTOFASTQ.out.versions.first() )
 
-    ch_reads_and_list
-    | map { meta, bam, csi, list -> list }
-    | set { ch_lists }
 
-    SAMTOOLS_FILTERTOFASTQ ( ch_reads, ch_lists )
-    ch_versions = ch_versions.mix ( SAMTOOLS_FILTERTOFASTQ.out.versions.first() )
+    // Filter inputs provided as FASTQ. NOTE(review): seqtk subseq keeps the reads named in the list - confirm PACBIO_FILTER.out.list is a keep-list, not a blocklist
+    ch_reads.fastq
+    | join(PACBIO_FILTER.out.list)  // join pairs items on the first tuple element (meta)
+    | multiMap { meta, fastq, list -> \
+            fastqs: [meta, fastq]
+            lists: list
+    }
+    | set { ch_reads_fastq }
+
+    SEQTK_SUBSEQ ( ch_reads_fastq.fastqs, ch_reads_fastq.lists )
+    ch_versions = ch_versions.mix ( SEQTK_SUBSEQ.out.versions.first() )
+
+
+    // Merge filtered FASTQ- and BAM-derived outputs as ch_filtered_fastq
+    SEQTK_SUBSEQ.out.sequences
+    | concat ( SAMTOOLS_FILTERTOFASTQ.out.fastq )
+    | set { ch_filtered_fastq }
 
 
     emit:
-    fastq    = SAMTOOLS_FILTERTOFASTQ.out.fastq     // channel: [ meta, /path/to/fastq ]
-    versions = ch_versions                          // channel: [ versions.yml ]
+    fastq    = ch_filtered_fastq        // channel: [ meta, /path/to/fastq ]
+    versions = ch_versions              // channel: [ versions.yml ]
 }

From e2a8d462e54e29972c2136509e4161d22b4086a3 Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Wed, 14 Aug 2024 14:39:43 +0100
Subject: [PATCH 4/4] prettier linting

---
 docs/usage.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/usage.md b/docs/usage.md
index 2e707a9..9423325 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -42,7 +42,7 @@ sample1_T5,pacbio,pacbio2.bam,pacbio2
 | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `sample`   | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_).                                                 |
 | `datatype` | Type of sequencing data. Must be one of `hic`, `Illumina`, `pacbio`, or `ont`.                                                                                                                                                        |
-| `datafile` | Full path to read data file. Must be `bam`, `cram`, `fastq.gz` or `fq.gz` for `Illumina` and `HiC`. Must be `bam`, `fastq.gz` or `fq.gz` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`.                          |
+| `datafile` | Full path to read data file. Must be `bam`, `cram`, `fastq.gz` or `fq.gz` for `Illumina` and `HiC`. Must be `bam`, `fastq.gz` or `fq.gz` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`.                                       |
 | `library`  | (Optional) The library value is a unique identifier which is assigned to read group (`@RG`) ID. If the library name is not specified, the pipeline will auto-create library name using the data filename provided in the samplesheet. |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.