update for release 2.2.0

SouthGreenPlatform · Dec 16, 2022 · da0a948 · da0a948
1 parent e04e512
commit da0a948
Show file tree

Hide file tree

Showing 19 changed files with 97 additions and 23 deletions.
diff --git a/README.rst b/README.rst
@@ -36,8 +36,7 @@ possible assembly between all possibilities.
 Citation
 ________
 
-https://doi.org/10.1101/2021.07.19.452922 (V1)
-https://doi.org/10.24072/pcjournal.153 (V2)
+https://doi.org/10.1101/2021.07.19.452922
 
 Authors
 _______

diff --git a/culebrONT/VERSION b/culebrONT/VERSION
@@ -1 +1 @@
-2.1.1
+2.2.0
diff --git a/culebrONT/install_files/config.yaml b/culebrONT/install_files/config.yaml
@@ -74,14 +74,15 @@ params:
 
 
     #### CORRECTION
-    CORRECTION_MAKERANGE:
+    CORRECTION_SEGMENTATION:
         SEGMENT_LEN: 50000              # segment length to split assembly and correct it  default=50000
         OVERLAP_LEN: 200                # overlap length between segments  default=200
 
     NANOPOLISH:
         OPTIONS: ''
 
     MEDAKA:
+        SEGMENTATION: true              # if false, there is no segmentation of contigs for medaka
         MEDAKA_TRAIN_WITH_REF: false    # if 'MEDAKA_TRAIN_WITH_REF' is True, training uses reference to found in DATA REF param.
 
         # Medaka does not take in count other parameters below if MEDAKA_TRAIN_WITH_REF is TRUE.

diff --git a/culebrONT/install_files/config_test/config_A.yaml b/culebrONT/install_files/config_test/config_A.yaml
@@ -56,7 +56,7 @@ params:
         OPTIONS: 'useGrid=false maxThreads=20'
     SMARTDENOVO:
         KMER_SIZE: 16
-        OPTIONS: '-J 5000'
+        OPTIONS: '-J 5000 -e zmo'
     SHASTA:
         MEM_MODE: 'filesystem'
         MEM_BACKING: 'disk'
@@ -74,14 +74,15 @@ params:
 
 
     #### CORRECTION
-    CORRECTION_MAKERANGE:
+    CORRECTION_SEGMENTATION:
         SEGMENT_LEN: 50000              # segment length to split assembly and correct it  default=50000
         OVERLAP_LEN: 200                # overlap length between segments  default=200
 
     NANOPOLISH:
         OPTIONS: ''
 
     MEDAKA:
+        SEGMENTATION: false
         MEDAKA_TRAIN_WITH_REF: false    # if 'MEDAKA_TRAIN_WITH_REF' is True, training uses reference found in DATA REF param.
 
         # Medaka does not take in count other parameters below if MEDAKA_TRAIN_WITH_REF is TRUE.

diff --git a/culebrONT/install_files/config_test/config_AC.yaml b/culebrONT/install_files/config_test/config_AC.yaml
@@ -74,14 +74,15 @@ params:
 
 
     #### CORRECTION
-    CORRECTION_MAKERANGE:
+    CORRECTION_SEGMENTATION:
         SEGMENT_LEN: 50000              # segment length to split assembly and correct it  default=50000
         OVERLAP_LEN: 200                # overlap length between segments  default=200
 
     NANOPOLISH:
         OPTIONS: ''
 
     MEDAKA:
+        SEGMENTATION: false
         MEDAKA_TRAIN_WITH_REF: false    # if 'MEDAKA_TRAIN_WITH_REF' is True, training uses reference found in DATA REF param.
 
         # Medaka does not take in count other parameters below if MEDAKA_TRAIN_WITH_REF is TRUE.

diff --git a/culebrONT/install_files/config_test/config_AP.yaml b/culebrONT/install_files/config_test/config_AP.yaml
@@ -74,14 +74,15 @@ params:
 
 
     #### CORRECTION
-    CORRECTION_MAKERANGE:
+    CORRECTION_SEGMENTATION:
         SEGMENT_LEN: 50000              # segment length to split assembly and correct it  default=50000
         OVERLAP_LEN: 200                # overlap length between segments  default=200
 
     NANOPOLISH:
         OPTIONS: ''
 
     MEDAKA:
+        SEGMENTATION: false
         MEDAKA_TRAIN_WITH_REF: false    # if 'MEDAKA_TRAIN_WITH_REF' is True, training uses reference found in DATA REF param.
 
         # Medaka does not take in count other parameters below if MEDAKA_TRAIN_WITH_REF is TRUE.

diff --git a/culebrONT/install_files/config_test/config_APC.yaml b/culebrONT/install_files/config_test/config_APC.yaml
@@ -74,16 +74,16 @@ params:
 
 
     #### CORRECTION
-    CORRECTION_MAKERANGE:
+    CORRECTION_SEGMENTATION:
         SEGMENT_LEN: 50000              # segment length to split assembly and correct it  default=50000
         OVERLAP_LEN: 200                # overlap length between segments  default=200
 
     NANOPOLISH:
         OPTIONS: ''
 
     MEDAKA:
+        SEGMENTATION: true
         MEDAKA_TRAIN_WITH_REF: false    # if 'MEDAKA_TRAIN_WITH_REF' is True, training uses reference found in DATA REF param.
-
         # Medaka does not take in count other parameters below if MEDAKA_TRAIN_WITH_REF is TRUE.
         MEDAKA_MODEL_PATH: 'r941_min_high_g303' # use a path if you have downloaded a model (or you want to use your own trained model) OR a simple string like 'r941_min_high_g303'
         MEDAKA_FEATURES_OPTIONS: '--batch_size 10 --chunk_len 100 --chunk_ovlp 10'

diff --git a/culebrONT/install_files/config_test/config_APCQ-CIRQ.yaml b/culebrONT/install_files/config_test/config_APCQ-CIRQ.yaml
@@ -74,14 +74,15 @@ params:
 
 
     #### CORRECTION
-    CORRECTION_MAKERANGE:
+    CORRECTION_SEGMENTATION:
         SEGMENT_LEN: 50000              # segment length to split assembly and correct it  default=50000
         OVERLAP_LEN: 200                # overlap length between segments  default=200
 
     NANOPOLISH:
         OPTIONS: ''
 
     MEDAKA:
+        SEGMENTATION: true
         MEDAKA_TRAIN_WITH_REF: false    # if 'MEDAKA_TRAIN_WITH_REF' is True, training uses reference found in DATA REF param.
 
         # Medaka does not take in count other parameters below if MEDAKA_TRAIN_WITH_REF is TRUE.

diff --git a/culebrONT/install_files/config_test/config_APCQ.yaml b/culebrONT/install_files/config_test/config_APCQ.yaml
@@ -74,14 +74,15 @@ params:
 
 
     #### CORRECTION
-    CORRECTION_MAKERANGE:
+    CORRECTION_SEGMENTATION:
         SEGMENT_LEN: 50000              # segment length to split assembly and correct it  default=50000
         OVERLAP_LEN: 200                # overlap length between segments  default=200
 
     NANOPOLISH:
         OPTIONS: ''
 
     MEDAKA:
+        SEGMENTATION: false
         MEDAKA_TRAIN_WITH_REF: false    # if 'MEDAKA_TRAIN_WITH_REF' is True, training uses reference found in DATA REF param.
 
         # Medaka does not take in count other parameters below if MEDAKA_TRAIN_WITH_REF is TRUE.

diff --git a/culebrONT/install_files/config_test/config_APQ.yaml b/culebrONT/install_files/config_test/config_APQ.yaml
@@ -74,14 +74,15 @@ params:
 
 
     #### CORRECTION
-    CORRECTION_MAKERANGE:
+    CORRECTION_SEGMENTATION:
         SEGMENT_LEN: 50000              # segment length to split assembly and correct it  default=50000
         OVERLAP_LEN: 200                # overlap length between segments  default=200
 
     NANOPOLISH:
         OPTIONS: ''
 
     MEDAKA:
+        SEGMENTATION: false
         MEDAKA_TRAIN_WITH_REF: false    # if 'MEDAKA_TRAIN_WITH_REF' is True, training uses reference found in DATA REF param.
 
         # Medaka does not take in count other parameters below if MEDAKA_TRAIN_WITH_REF is TRUE.

diff --git a/culebrONT/schemas/config.schema.yaml b/culebrONT/schemas/config.schema.yaml
@@ -236,7 +236,7 @@ properties:
                     RACON_ROUNDS:
                         type: integer
                         description: "number of rounds of racon for polishing"
-            CORRECTION_MAKERANGE:
+            CORRECTION_SEGMENTATION:
                 type: object
                 additionalProperties: false
                 properties:
@@ -257,6 +257,9 @@ properties:
                 type: object
                 additionalProperties: false
                 properties:
+                    SEGMENTATION:
+                        type: boolean
+                        description: "active segmentation or not"
                     MEDAKA_TRAIN_WITH_REF:
                         type: boolean
                         description: "true if medaka should train"

diff --git a/culebrONT/snakefiles/assemblers.snake b/culebrONT/snakefiles/assemblers.snake
@@ -119,6 +119,7 @@ rule run_minimap_for_miniasm:
         paf = f"{output_dir}{{fastq}}/ASSEMBLERS/MINIASM/ASSEMBLER/output_minimap2.paf",
         version = f"{output_dir}{{fastq}}/ASSEMBLERS/MINIASM/ASSEMBLER/MINIMAP2-version.txt"
     log:
+        output =  f"{output_dir}{{fastq}}/ASSEMBLERS/MINIASM/ASSEMBLER/LOGS/{{fastq}}_MINIMAP4MINIASM.o",
         error =  f"{output_dir}{{fastq}}/ASSEMBLERS/MINIASM/ASSEMBLER/LOGS/{{fastq}}_MINIMAP4MINIASM.e",
     benchmark:
         f"{output_dir}{{fastq}}/BENCHMARK/ASSEMBLER/MINIASM.txt"
@@ -131,6 +132,7 @@ rule run_minimap_for_miniasm:
            output:
                paf: {output.paf}
            log:
+               output: {log.output}
                error: {log.error}
            """
     singularity:
@@ -157,6 +159,7 @@ rule run_miniasm:
         gfa_miniasm = f"{output_dir}{{fastq}}/ASSEMBLERS/MINIASM/ASSEMBLER/output_miniasm.gfa", # voir si à utiliser par gfapy
         version = f"{output_dir}{{fastq}}/ASSEMBLERS/MINIASM/ASSEMBLER/MINIASM-version.txt"
     log:
+        output =  f"{output_dir}{{fastq}}/ASSEMBLERS/MINIASM/ASSEMBLER/LOGS/{{fastq}}_MINIASM.o",
         error =  f"{output_dir}{{fastq}}/ASSEMBLERS/MINIASM/ASSEMBLER/LOGS/{{fastq}}_MINIASM.e",
     benchmark:
         f"{output_dir}{{fastq}}/BENCHMARK/ASSEMBLER/MINIMAP4MINIASM.txt"
@@ -170,6 +173,7 @@ rule run_miniasm:
            output:
                gfa: {output.gfa_miniasm}
            log:
+               output: {log.output}
                error: {log.error}
            """
     singularity:
@@ -203,6 +207,7 @@ rule run_minipolish:
     params:
         racon_rounds = config['params']['RACON']['RACON_ROUNDS'] if config['POLISHING']['RACON'] else 2,
     log:
+        output =  f"{output_dir}{{fastq}}/ASSEMBLERS/MINIASM/ASSEMBLER/LOGS/{{fastq}}_MINIASM_MINIPOLISH.o",
         error =  f"{output_dir}{{fastq}}/ASSEMBLERS/MINIASM/ASSEMBLER/LOGS/{{fastq}}_MINIASM_MINIPOLISH.e"
     benchmark:
         f"{output_dir}{{fastq}}/BENCHMARK/ASSEMBLER/MINIASM_MINIPOLISH.txt"
@@ -220,6 +225,7 @@ rule run_minipolish:
         params:
             racon_rounds: {params.racon_rounds}
         log:
+            output: {log.output}
             error: {log.error}
         """
     singularity:
@@ -316,7 +322,7 @@ rule run_smartdenovo:
         dir = f"{output_dir}{{fastq}}/ASSEMBLERS/SMARTDENOVO/ASSEMBLER",
         kmersize = f"16 " if f"{config['params']['SMARTDENOVO']['KMER_SIZE']}" == '' else f"{config['params']['SMARTDENOVO']['KMER_SIZE']}",
         options = f"{config['params']['SMARTDENOVO']['OPTIONS']}",
-        out_tmp= f"{output_dir}{{fastq}}/ASSEMBLERS/SMARTDENOVO/ASSEMBLER/SMART.dmo.cns"
+        out_tmp = f"{output_dir}{{fastq}}/ASSEMBLERS/SMARTDENOVO/ASSEMBLER/SMART.{'zmo' if 'zmo' in config['params']['SMARTDENOVO']['OPTIONS'] else 'dmo'}.cns"
     output:
         fasta = f"{output_dir}{{fastq}}/ASSEMBLERS/SMARTDENOVO/ASSEMBLER/assembly2Circ.fasta" if config['CIRCULAR'] else f"{output_dir}{{fastq}}/ASSEMBLERS/SMARTDENOVO/ASSEMBLER/assembly.fasta",
         version = f"{output_dir}{{fastq}}/ASSEMBLERS/SMARTDENOVO/ASSEMBLER/SMARTDENOVO-version.txt",

diff --git a/culebrONT/snakefiles/circular.snake b/culebrONT/snakefiles/circular.snake
@@ -176,7 +176,10 @@ def fasta_to_fixstart(wildcards):
         if wildcards.quality_step == 'NANOPOLISH':
             return rules.run_nanopolish_merge.output.fasta
         elif wildcards.quality_step == 'MEDAKA':
-            return rules.run_medaka_merge.output.fasta
+            if bool(config['params']['MEDAKA']['SEGMENTATION']):
+                return rules.run_medaka_merge.output.fasta
+            else:
+                return rules.run_medaka_consensus_oneshot.output.fasta
         elif wildcards.quality_step == f'PILON-{culebront.nb_pilon_rounds}':
             return f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/CORRECTION/PILON/pilon_{culebront.nb_pilon_rounds}/assembly.pilon{culebront.nb_pilon_rounds}.fasta"
     #POLISHING

diff --git a/culebrONT/snakefiles/correction.snake b/culebrONT/snakefiles/correction.snake
@@ -10,8 +10,8 @@ rule run_makerange:
         segments_list = f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/CORRECTION/MAKERANGE/segments.txt"
     params:
         dir = f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/CORRECTION/MAKERANGE/",
-        segment_len = config['params']['CORRECTION_MAKERANGE']['SEGMENT_LEN'],
-        overlap_len = config['params']['CORRECTION_MAKERANGE']['OVERLAP_LEN'],
+        segment_len = config['params']['CORRECTION_SEGMENTATION']['SEGMENT_LEN'],
+        overlap_len = config['params']['CORRECTION_SEGMENTATION']['OVERLAP_LEN'],
     log:
         output = f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/CORRECTION/MAKERANGE/LOGS/{{fastq}}_{{assemblers}}_MAKERANGE.o",
         error = f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/CORRECTION/MAKERANGE/LOGS/{{fastq}}_{{assemblers}}_MAKERANGE.e",
@@ -255,7 +255,6 @@ rule run_nanopolish_merge:
         nanopolish vcf2fasta --skip-checks -g {input.draft} {params.vcf_dir}*.nanopolish-segment.vcf 1>{output.fasta} 2>>{log.error}
         """
 
-
 ################################ MEDAKA ###################################
 rule index_fasta_to_correction:
     """
@@ -495,6 +494,55 @@ rule run_medaka_merge:
         medaka stitch --threads {threads} {params.hdf_dir}/*.medaka-segment.hdf {input.draft} {output.fasta} 1>{log.output} 2>{log.error}
         """
 
+rule run_medaka_consensus_oneshot:
+    """
+    launching Medaka Consensus (medaka version >=1.2) in oneshot for a fasta
+    """
+    threads: get_threads('run_medaka_consensus', 4)
+    input:
+        draft = rules.index_fasta_to_correction.output.fasta,
+        fastq= get_fastq,
+    output:
+        fasta = f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/CORRECTION/MEDAKA/consensusoneshot.fasta",
+        version= f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/CORRECTION/MEDAKA/MEDAKA-versiononeshot.txt",
+    params:
+        dir = directory(f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/CORRECTION/MEDAKA"),
+        fasta = f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/CORRECTION/MEDAKA/consensus.fasta",
+        options = config['params']['MEDAKA']['MEDAKA_CONSENSUS_OPTIONS'],
+        model = f"{rules.run_medaka_train.output.fasta_val_cat_acc if config['params']['MEDAKA']['MEDAKA_TRAIN_WITH_REF'] else config['params']['MEDAKA']['MEDAKA_MODEL_PATH']}",
+    log:
+        output = f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/CORRECTION/MEDAKA/LOGS/{{fastq}}_{{assemblers}}_MEDAKA_MERGEoneshot.o",
+        error=f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/CORRECTION/MEDAKA/LOGS/{{fastq}}_{{assemblers}}_MEDAKA_MERGEoneshot.e",
+    benchmark:
+        f"{output_dir}{{fastq}}/BENCHMARK/CORRECTION/MEDAKA/{{assemblers}}oneshot.txt",
+    message:
+        """
+        Launching {rule}.
+        threads: {threads}
+        input:
+            draft: {input.draft}
+            fastq: {input.fastq}
+        output:
+            consensus: {output.fasta}
+        params:
+            model: {params.model}
+        log:
+            output: {log.output}
+            error: {log.error}
+        """
+    singularity:
+        tools_config['SINGULARITY']['TOOLS']
+    envmodules:
+        tools_config["ENVMODULE"]["MEDAKA"]
+    shell:
+        """
+        [ ! -f {output.version} ] &&  [ ! -d versions ] && mkdir -p versions; medaka --version > {output.version}
+        ln -s -f {output.version} {output_dir}versions/MEDAKA-version.txt
+        (cd {params.dir};
+        medaka_consensus -i {input.fastq} -d {input.draft} -o {params.dir} -t {threads} -m {params.model}
+        mv {params.fasta} {output.fasta}) 1>{log.output} 2>{log.error}
+        """
+
 ################################ PILON ###################################
 
 rule run_pilon_first_round:
@@ -605,4 +653,4 @@ rule run_pilon:
         samtools index -@ {threads} {params.bam}
         pilon --genome {input.draft} --fix all --changes --bam {params.bam} --threads {threads} --output {params.rep} {params.options}) 1>{log.output} 2>{log.error}
         sed 's/_pilon//g' {output.pilon} > {output.corrected}
-        """
+        """
diff --git a/culebrONT/snakefiles/polishers.snake b/culebrONT/snakefiles/polishers.snake
@@ -29,6 +29,7 @@ rule run_racon:
             paf: {output.paf}
             fasta: {output.fasta}
         log:
+            output: {log.output}
             error: {log.error}
         """
     singularity:

diff --git a/culebrONT/snakefiles/quality.snake b/culebrONT/snakefiles/quality.snake
@@ -25,7 +25,10 @@ def fasta_to_quality(wildcards):
     elif wildcards.quality_step == 'NANOPOLISHsFIX':
         return f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/FIXSTART/NANOPOLISH-STARTFIXED.fasta"
     elif wildcards.quality_step == 'MEDAKA':
-        return rules.run_medaka_merge.output.fasta
+        if bool(config['params']['MEDAKA']['SEGMENTATION']):
+            return rules.run_medaka_merge.output.fasta
+        else:
+            return rules.run_medaka_consensus_oneshot.output.fasta
     elif wildcards.quality_step == 'MEDAKAsFIX':
         return f"{output_dir}{{fastq}}/ASSEMBLERS/{{assemblers}}/FIXSTART/MEDAKA-STARTFIXED.fasta"
     elif wildcards.quality_step == 'PILON':
@@ -478,6 +481,7 @@ rule combined_fastq:
         output:
             combined: {output.combined_data}
         log:
+            output: {log.output}
             error: {log.error}
         command:
             {params.command} {input.illumina_rep}/*{culebront.illumina_files_ext} > {output.combined_data}
@@ -495,7 +499,7 @@ rule run_KAT:
     This shows the completeness of an assembly, i.e. are all the reads assembled into contigs representative of the sequence data.
     https://kat.readthedocs.io/en/latest/using.html
     """
-    threads: get_threads('run_KAT', 4)
+    threads: get_threads('run_KAT', 1)
     input:
         fasta = rules.preparing_fasta_to_quality.output.renamed,
         combined_data = rules.combined_fastq.output.combined_data,

diff --git a/culebrONT/snakemake_scripts/Report.Rmd b/culebrONT/snakemake_scripts/Report.Rmd
@@ -1,5 +1,5 @@
 ---
-title: "CulebrONT 2.1.1 report"
+title: "CulebrONT 2.2.0 report"
 date: "`r format(Sys.time(), '%d %B, %Y')`"
 output:
   rmdformats::readthedown:

diff --git a/docs/source/INSTALL.rst b/docs/source/INSTALL.rst
@@ -276,6 +276,7 @@ This would save users a painful exploration of the snakefiles of CulebrONT.
    run_medaka_train
    run_medaka_consensus
    run_medaka_merge
+   run_medaka_oneshot
    run_pilon_first_round
    run_pilon
    run_racon

diff --git a/docs/source/WORKFLOWS.rst b/docs/source/WORKFLOWS.rst
@@ -111,6 +111,8 @@ You can manage tools parameters on the params section in the ``config.yaml`` fil
 
 ``Medaka`` specific options:
 
+* If 'SEGMENTATION' is false, there is no segmentation of contigs for medaka
+
 * If 'MEDAKA_TRAIN_WITH_REF' is activated, Medaka launchs the training using the reference found in 'DATA/REF' path parameter. Medaka will then not take into account other Medaka model parameters and will use the resulting trained model instead.
 
 * If 'MEDAKA_TRAIN_WITH_REF' is deactivated, Medaka does not launch training, but uses instead the model provided in 'MEDAKA_MODEL_PATH' parameter. Give to CulebrONT the path of the Medaka model *OR* just the model name in order to correct assemblies. This parameter could not be empty.