From 3d4521ac85b7c4e74db5ba1c2d49256e9b84b547 Mon Sep 17 00:00:00 2001 From: Yuk Kei Wan Date: Thu, 19 Sep 2024 17:21:34 +0800 Subject: [PATCH] add dorado modifications --- bin/check_samplesheet.py | 2 +- conf/test_bc_nodx.config | 2 +- conf/test_bc_nodx_dnamod.config | 36 +++++ modules/local/bambu.nf | 4 +- modules/local/dorado_aligner.nf | 25 ++++ .../local/{dorado.nf => dorado_basecaller.nf} | 9 +- modules/local/get_test_data.nf | 1 + modules/local/modkit_bedmethyl.nf | 27 ---- modules/local/modkit_pileup.nf | 29 ++++ nextflow.config | 4 + workflows/nanoseq.nf | 125 +++++++++++------- 11 files changed, 179 insertions(+), 85 deletions(-) create mode 100644 conf/test_bc_nodx_dnamod.config create mode 100644 modules/local/dorado_aligner.nf rename modules/local/{dorado.nf => dorado_basecaller.nf} (65%) delete mode 100644 modules/local/modkit_bedmethyl.nf create mode 100644 modules/local/modkit_pileup.nf diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index f1458e06..ec19351e 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -116,7 +116,7 @@ def check_samplesheet(file_in, updated_path, file_out): input_file = "/".join([updated_path, input_file.split("/")[-1]]) list_dir = os.listdir(input_file) fast5 = input_file - if not (all(fname.endswith(".fast5") for fname in list_dir)): + if not (all(fname.endswith(".fast5") for fname in list_dir)) and not (all(fname.endswith(".pod5") for fname in list_dir)): if "fast5" in list_dir and "fastq" in list_dir: fast5 = input_file + "/fast5" ## CHECK FAST5 DIRECTORY diff --git a/conf/test_bc_nodx.config b/conf/test_bc_nodx.config index 1a70cb99..743ceb65 100644 --- a/conf/test_bc_nodx.config +++ b/conf/test_bc_nodx.config @@ -17,7 +17,7 @@ params { max_time = 12.h // Input data to perform basecalling and to skip demultipexing - input = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_nodx.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_nodx.csv' fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa' protocol = 'cDNA' flowcell = 'FLO-MIN106' diff --git a/conf/test_bc_nodx_dnamod.config b/conf/test_bc_nodx_dnamod.config new file mode 100644 index 00000000..a3cdd729 --- /dev/null +++ b/conf/test_bc_nodx_dnamod.config @@ -0,0 +1,36 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/nanoseq -profile test_bc_nodx, + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on Travis + max_cpus = 2 + max_memory = 6.GB + max_time = 12.h + + // Input data to perform basecalling and to skip demultipexing + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nanoseq/3.2/samplesheet/samplesheet_bc_nodx_dnamod.csv' + input_path_file_type= 'pod5' + bedmethyl_out = true + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa' + protocol = 'cDNA' + flowcell = 'FLO-MIN106' + kit = 'SQK-DCS108' + dorado_model = 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' + dorado_modification = '5mCG_5hmCG' + dorado_device = 'cpu' + skip_bigbed = true + skip_bigwig = true + skip_demultiplexing = true + skip_quantification = true + skip_fusion_analysis= true + skip_modification_analysis=true +} diff --git a/modules/local/bambu.nf b/modules/local/bambu.nf index 8a0abd6d..d757c274 100644 --- a/modules/local/bambu.nf +++ b/modules/local/bambu.nf @@ -3,8 +3,8 @@ process BAMBU { conda "conda-forge::r-base=4.0.3 bioconda::bioconductor-bambu=3.0.8 bioconda::bioconductor-bsgenome=1.66.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-bambu:3.2.4--r43hf17093f_0' : - 'quay.io/biocontainers/bioconductor-bambu:3.2.4--r43hf17093f_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-bambu:3.4.0--r43hf17093f_0' : + 'quay.io/biocontainers/bioconductor-bambu:3.4.0--r43hf17093f_0' }" input: tuple path(fasta), path(gtf) diff --git a/modules/local/dorado_aligner.nf b/modules/local/dorado_aligner.nf new file mode 100644 index 00000000..3fef036d --- /dev/null +++ b/modules/local/dorado_aligner.nf @@ -0,0 +1,25 @@ +process DORADO_ALIGNER { + tag "$meta.id" + label 'process_medium' + + container "docker.io/ontresearch/dorado" + + input: + tuple val(meta), path(mod_bam) + path fasta + + output: + tuple val(meta), path("aligned_sorted.bam"), path("*.bai") , emit: aligned_bam + path "versions.yml" , emit: versions + + script: + """ + dorado aligner --mm2-preset map-ont $fasta $mod_bam > aligned.bam && samtools sort aligned.bam -o aligned_sorted.bam && samtools index aligned_sorted.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dorado: \$(echo \$(dorado --version 2>&1) | sed -r 's/.{81}//') + END_VERSIONS + """ +} + diff --git a/modules/local/dorado.nf b/modules/local/dorado_basecaller.nf similarity index 65% rename from modules/local/dorado.nf rename to modules/local/dorado_basecaller.nf index 0e40f47e..200e7bb1 100644 --- a/modules/local/dorado.nf +++ b/modules/local/dorado_basecaller.nf @@ -1,4 +1,4 @@ -process DORADO { +process DORADO_BASECALLER { tag "$meta.id" label 'process_medium' @@ -10,20 +10,19 @@ process DORADO { val dorado_model output: - tuple val(meta), path("*.fastq.gz") , emit: fastq + tuple val(meta), path("basecall*") , emit: dorado_out path "versions.yml" , emit: versions script: + def emit_args = (params.dorado_modification == null) ? " --emit-fastq > basecall.fastq && gzip basecall.fastq" : " --modified-bases $params.dorado_modification > basecall.bam" """ dorado download --model $dorado_model - dorado basecaller $dorado_model $pod5_path --device $dorado_device --emit-fastq > basecall.fastq + dorado basecaller $dorado_model $pod5_path --device $dorado_device $emit_args cat <<-END_VERSIONS > versions.yml "${task.process}": dorado: \$(echo \$(dorado --version 2>&1) | sed -r 's/.{81}//') END_VERSIONS - - gzip basecall.fastq """ } diff --git a/modules/local/get_test_data.nf b/modules/local/get_test_data.nf index 442b23c8..d2aa4d3c 100644 --- a/modules/local/get_test_data.nf +++ b/modules/local/get_test_data.nf @@ -6,6 +6,7 @@ process GET_TEST_DATA { output: path "test-datasets/fast5/$barcoded/" , emit: ch_input_fast5_dir_path path "test-datasets/modification_fast5_fastq/", emit: ch_input_dir_path + path "test-datasets/pod5/" , emit: ch_pod5_dir_path path "versions.yml" , emit: versions when: diff --git a/modules/local/modkit_bedmethyl.nf b/modules/local/modkit_bedmethyl.nf deleted file mode 100644 index 4a9ae7c3..00000000 --- a/modules/local/modkit_bedmethyl.nf +++ /dev/null @@ -1,27 +0,0 @@ -process DORADO { - tag "$meta.id" - label 'process_medium' - - container "modkit" - - input: - tuple val(meta), path(mod_bam) - - output: - tuple val(meta), path("*.bed") , emit: bedmethyl - path "versions.yml" , emit: versions - - script: - bedmethyl = "$meta.id" +".bed" - """ - modkit pileup $mod_bam $bedmethyl - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - dorado: \$(echo \$(dorado --version 2>&1) | sed -r 's/.{81}//') - END_VERSIONS - - gzip basecall.fastq - """ -} - diff --git a/modules/local/modkit_pileup.nf b/modules/local/modkit_pileup.nf new file mode 100644 index 00000000..02a5d2e8 --- /dev/null +++ b/modules/local/modkit_pileup.nf @@ -0,0 +1,29 @@ +process MODKIT_PILEUP { + tag "$meta.id" + label 'process_high_memory' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ont-modkit:0.4.0--h5c23e0d_0' : + 'quay.io/biocontainers/ont-modkit:0.4.0--h5c23e0d_0' }" + + input: + tuple val(meta), path(aligned_mod_bam), path(bai) + + output: + tuple val(meta), path(bedmethyl) , emit: bedmethyl + path "versions.yml" , emit: versions + + script: + bedmethyl = "$meta.id" +".bed" + """ + modkit pileup $aligned_mod_bam $bedmethyl --threads $task.cpus + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + modkit: \$(echo \$(modkit --version 2>&1) | sed -r 's/.{81}//') + END_VERSIONS + + gzip basecall.fastq + """ +} + diff --git a/nextflow.config b/nextflow.config index 5e3a5f8b..1bb89fc4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,12 +21,15 @@ params { // Options: Basecalling and Demultiplexing input_path = null + input_path_file_type = 'fast5' + bedmethyl_out = false flowcell = null kit = null barcode_kit = null barcode_both_ends = false trim_barcodes = false dorado_model = null + doraro_modification = null dorado_device = 'cuda:all' qcat_min_score = 60 qcat_detect_middle = false @@ -227,6 +230,7 @@ profiles { test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } test_bc_nodx { includeConfig 'conf/test_bc_nodx.config' } + test_bc_nodx_dnamod { includeConfig 'conf/test_bc_nodx_dnamod.config' } test_nobc_dx { includeConfig 'conf/test_nobc_dx.config' } test_nobc_nodx_stringtie { includeConfig 'conf/test_nobc_nodx_stringtie.config' } test_nobc_nodx_noaln { includeConfig 'conf/test_nobc_nodx_noaln.config' } diff --git a/workflows/nanoseq.nf b/workflows/nanoseq.nf index 483096ef..0369dd27 100644 --- a/workflows/nanoseq.nf +++ b/workflows/nanoseq.nf @@ -125,7 +125,9 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi include { GET_TEST_DATA } from '../modules/local/get_test_data' include { GET_NANOLYSE_FASTA } from '../modules/local/get_nanolyse_fasta' include { FAST5_TO_POD5 } from '../modules/local/fast5_to_pod5' -include { DORADO } from '../modules/local/dorado' +include { DORADO_BASECALLER } from '../modules/local/dorado_basecaller' +include { DORADO_ALIGNER } from '../modules/local/dorado_aligner' +include { MODKIT_PILEUP } from '../modules/local/modkit_pileup' include { GTF2BED } from '../modules/local/gtf2bed' include { BAM_RENAME } from '../modules/local/bam_rename' include { BAMBU } from '../modules/local/bambu' @@ -181,8 +183,13 @@ workflow NANOSEQ{ if (!isOffline()) { GET_TEST_DATA () if (params.skip_modification_analysis) { - GET_TEST_DATA.out.ch_input_fast5_dir_path - .set { ch_input_path } + if (params.bedmethyl_out){ + GET_TEST_DATA.out.ch_pod5_dir_path + .set { ch_input_path } + } else { + GET_TEST_DATA.out.ch_input_fast5_dir_path + .set { ch_input_path } + } } else { GET_TEST_DATA.out.ch_input_dir_path .set { ch_input_path } @@ -216,51 +223,68 @@ workflow NANOSEQ{ .set { ch_sample } if (!params.skip_basecalling) { + if (params.input_path_file_type == 'fast5'){ + if (!params.skip_demultiplexing) { + ch_input_path + .map { it -> [ [id:'undemultiplexed'], it ] } + .set { ch_fast5_dir } + } else { + ch_sample + .set { ch_fast5_dir } + } - if (!params.skip_demultiplexing) { - ch_input_path - .map { it -> [ [id:'undemultiplexed'], it ] } - .set { ch_fast5_dir } + /* + * MODULE: Convert fast5 to pod5 + */ + FAST5_TO_POD5 ( ch_fast5_dir ) + ch_pod5 = FAST5_TO_POD5.out.pod5 } else { - ch_sample - .set { ch_fast5_dir } + if (!params.skip_demultiplexing) { + ch_input_path + .map { it -> [ [id:'undemultiplexed'], it ] } + .set { ch_pod5 } + } else { + ch_sample + .set { ch_pod5 } + } } - /* - * MODULE: Convert fast5 to pod5 - */ - FAST5_TO_POD5 ( ch_fast5_dir ) - ch_pod5 = FAST5_TO_POD5.out.pod5 - /* * MODULE: Basecalling and demultipexing using Dorado */ - DORADO ( ch_pod5, params.dorado_device, params.dorado_model ) - ch_software_versions = ch_software_versions.mix(DORADO.out.versions.ifEmpty(null)) - if (!params.skip_demultiplexing) { - - /* - * MODULE: Demultipexing using qcat - */ - ch_barcode_kit = Channel.from(params.barcode_kit) - - /* - * MODULE: Demultipexing using qcat - */ - QCAT ( DORADO.out.fastq , ch_barcode_kit ) - QCAT.out.reads - .map { it -> it[1] } - .flatten() - .map { it -> [ it.baseName.substring(0,it.baseName.lastIndexOf('.')), it ] } - .join(ch_sample.map{ meta, empty -> [meta.barcode, meta] }, by: [0] ) - .map { it -> [ it[2], it[1] ] } - .set { ch_fastq } // [ meta, .fastq.qz ] - ch_software_versions = ch_software_versions.mix(QCAT.out.versions.ifEmpty(null)) + DORADO_BASECALLER ( ch_pod5, params.dorado_device, params.dorado_model ) + ch_software_versions = ch_software_versions.mix(DORADO_BASECALLER.out.versions.ifEmpty(null)) + if (!params.bedmethyl_out) { + if (!params.skip_demultiplexing) { + + /* + * MODULE: Demultipexing using qcat + */ + ch_barcode_kit = Channel.from(params.barcode_kit) + + /* + * MODULE: Demultipexing using qcat + */ + QCAT ( DORADO_BASECALLER.out.dorado_out , ch_barcode_kit ) + QCAT.out.reads + .map { it -> it[1] } + .flatten() + .map { it -> [ it.baseName.substring(0,it.baseName.lastIndexOf('.')), it ] } + .join(ch_sample.map{ meta, empty -> [meta.barcode, meta] }, by: [0] ) + .map { it -> [ it[2], it[1] ] } + .set { ch_fastq } // [ meta, .fastq.qz ] + ch_software_versions = ch_software_versions.mix(QCAT.out.versions.ifEmpty(null)) + } else { + DORADO_BASECALLER.out.dorado_out + .set { ch_fastq } + } } else { - DORADO.out.fastq - .set { ch_fastq } + ch_fastq = Channel.empty() + ch_fasta = Channel.empty() + DORADO_ALIGNER( DORADO_BASECALLER.out.dorado_out, params.fasta ) + MODKIT_PILEUP ( DORADO_ALIGNER.out.aligned_bam ) } } else { @@ -301,6 +325,7 @@ workflow NANOSEQ{ } } + if (params.run_nanolyse) { if (!params.nanolyse_fasta) { if (!isOffline()) { @@ -337,20 +362,22 @@ workflow NANOSEQ{ ch_fastqc_multiqc = QCFASTQ_NANOPLOT_FASTQC.out.fastqc_multiqc.ifEmpty([]) } - ch_fasta = Channel.from( [id:'reference'], fasta ).collect() + if (!params.bedmethyl_out){ + ch_fasta = Channel.from( [id:'reference'], fasta ).collect() - /* + /* * SUBWORKFLOW: Make chromosome size file and covert GTF to BED12 */ - CUSTOM_GETCHROMSIZES( ch_fasta ) - ch_chr_sizes = CUSTOM_GETCHROMSIZES.out.sizes - ch_fai = CUSTOM_GETCHROMSIZES.out.fai - ch_software_versions = ch_software_versions.mix(CUSTOM_GETCHROMSIZES.out.versions.first().ifEmpty(null)) - - // will add the following in when nf-core/modules/minimap2/align supports junction bed input - //GTF2BED ( ch_chr_sizes ) - //ch_gtf_bed = GTF2BED.out.gtf_bed - //gtf2bed_version = GTF2BED.out.versions + CUSTOM_GETCHROMSIZES( ch_fasta ) + ch_chr_sizes = CUSTOM_GETCHROMSIZES.out.sizes + ch_fai = CUSTOM_GETCHROMSIZES.out.fai + ch_software_versions = ch_software_versions.mix(CUSTOM_GETCHROMSIZES.out.versions.first().ifEmpty(null)) + + // will add the following in when nf-core/modules/minimap2/align supports junction bed input + //GTF2BED ( ch_chr_sizes ) + //ch_gtf_bed = GTF2BED.out.gtf_bed + //gtf2bed_version = GTF2BED.out.versions + } ch_samtools_multiqc = Channel.empty() if (!params.skip_alignment) {