nextflow_schema.json

{
    "$schema": "http://json-schema.org/draft-07/schema",
    "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json",
    "title": "epi2me-labs/wf-transcriptomes",
    "workflow_title": "Workflow Transcriptomes",
    "description": "Transcriptome analysis including assembly and annotation of cDNA and direct RNA sequencing data, gene fusions and differential expression.",
    "demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-transcriptomes/wf-transcriptomes-demo.tar.gz",
    "aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-transcriptomes/wf-transcriptomes-demo/aws.nextflow.config",
    "url": "https://github.com/epi2me-labs/wf-transcriptomes",
    "type": "object",
    "definitions": {
        "input_options": {
            "title": "Input Options",
            "type": "object",
            "fa_icon": "fas fa-terminal",
            "description": "Parameters for finding and handling input data for analysis.",
            "properties": {
                "fastq": {
                    "type": "string",
                    "format": "path",
                    "title": "FASTQ",
                    "demo_data": "${projectDir}/test_data/fastq",
                    "description": "FASTQ files to use in the analysis.",
                    "help_text": "This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`."
                },
                "transcriptome_source": {
                    "type": "string",
                    "enum": [
                        "precomputed",
                        "reference-guided"
                    ],
                    "default": "reference-guided",
                    "description": "Select how the transcriptome used for analysis should be prepared.",
                    "help_text": "To analyse only gene fusions and differential expression use of an existing transcriptome may be preferred and so 'precomputed' should be selected. In this case the 'ref_transcriptome' parameter should be specified. To create a reference transcriptome using an existing reference genome, select 'reference guided' and specify the 'ref_genome' parameter."
                },
                "ref_genome": {
                    "type": "string",
                    "title": "Reference genome",
                    "format": "file-path",
                    "demo_data": "${projectDir}/test_data/SIRV_150601a.fasta",
                    "description": "Path to reference genome sequence [.fa/.fq/.fa.gz/fq.gz]. Required for reference-based workflow.",
                    "help_text": "A reference genome is required for reference-based assembly of a transcriptome."
                },
                "ref_transcriptome": {
                    "type": "string",
                    "title": "Reference transcriptome",
                    "format": "file-path",
                    "description": "Transcriptome reference file. Required for precomputed transcriptome calculation and for differential expression analysis.",
                    "help_text": "A reference transcriptome related to the sample under study. Must be supplied when the 'Transcriptome source' parameter has been set to 'precomputed' or to perform differential expression."
                },
                "ref_annotation": {
                    "type": "string",
                    "title": "Reference annotation",
                    "format": "file-path",
                    "demo_data": "${projectDir}/test_data/SIRV_isoforms.gtf",
                    "description": "A reference annotation in GFF2 or GFF3 format (extensions .gtf(.gz), .gff(.gz), .gff3(.gz)). Only annotation files from [Encode](https://www.encodeproject.org), [Ensembl](https://www.ensembl.org/index.html) and [NCBI](https://www.ncbi.nlm.nih.gov/) are supported.",
                    "help_text": "This will be used for guiding the transcriptome assembly and to label transcripts with their corresponding gene identifiers."
                },
                "direct_rna": {
                    "type": "boolean",
                    "default": false,
                    "title": "direct RNA",
                    "description": "Set to true for direct RNA sequencing.",
                    "help_text": " Omits the pychopper step."
                },
                "analyse_unclassified": {
                    "type": "boolean",
                    "default": false,
                    "title": "Analyse unclassified",
                    "description": "Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory.",
                    "help_text": "If selected and if the input is a multiplex directory the workflow will also process the unclassified directory."
                }
            },
            "allOf": [
                {
                    "required": [
                        "fastq",
                        "transcriptome_source"
                    ]
                }]
        },
        "output_options": {
            "title": "Output Options",
            "type": "object",
            "description": "Parameters for saving and naming workflow outputs.",
            "default": "",
            "properties": {
                "out_dir": {
                    "type": "string",
                    "format": "directory-path",
                    "default": "output",
                    "description": "Directory for output of all user-facing files."
                }
            }
        },
        "sample_options": {
            "title": "Sample Options",
            "type": "object",
            "description": "Parameters that relate to samples such as sample sheets and sample names.",
            "default": "",
            "properties": {
                "sample_sheet": {
                    "type": "string",
                    "title": "Sample and condition sheet",
                    "format": "file-path",
                    "description": "A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. If you are running the differential expression workflow, there must be an additional column `condition` with two labels, one of which must be `control` (e.g. `control` and `treated`). Control will indicate which samples will be used as the reference. There should be at least 3 repeats for each condition.",
                    "help_text": "The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed."
                },
                "sample": {
                    "type": "string",
                    "description": "A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files."
                }
            }
        },
        "options_for_reference_based_workflow": {
            "title": "Options for reference-based workflow",
            "type": "object",
            "description": "Parameters that are used solely for the reference-guided workflow",
            "properties": {
                "plot_gffcmp_stats": {
                    "type": "boolean",
                    "default": true,
                    "title": "Plot gffcompare statistics",
                    "description": "Create a PDF of plots from showing gffcompare results",
                    "help_text": "If set to true, a PDF file containing detailed gffcompare reults will be output"
                },
                "gffcompare_opts": {
                    "type": "string",
                    "title": "Plot gffcompare options",
                    "description": "Extra command-line options to give to gffcompare -r",
                    "default": "-R",
                    "help_text": "For a list of possible options see [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml)."
                },
                "minimap2_index_opts": {
                    "type": "string",
                    "title": "Minimap2 index options",
                    "description": "Extra command-line options for minimap2 indexing.",
                    "default": "-k14",
                    "help_text": "See [minimap2 index options](https://lh3.github.io/minimap2/minimap2.html#4) for more information. These will only be relevant in the reference based transcriptome assembly."
                },
                "minimap2_opts": {
                    "type": "string",
                    "title": "Minimap2 options",
                    "description": "Additional command-line options for minimap2 alignment.",
                    "default": "-uf",
                    "help_text": "See [minimap2 options](https://lh3.github.io/minimap2/minimap2.html#5) for further information. These will only be relevant in the reference based transcriptome assembly."
                },
                "minimum_mapping_quality": {
                    "type": "integer",
                    "description": "filter aligned reads by MAPQ quality.",
                    "default": 40,
                    "help_text": "Reads that do not meet this mapping quality after minimap2 alignment, will be filtered out."
                },
                "poly_context": {
                    "type": "integer",
                    "description": "Region size at end of reads to apply poly(A) filter.",
                    "help_text": "Mispriming of polyT primers can occur when, instead of priming transcription from a polyA tail, it is primed from internal polyA rich regions in the genome. In these cases the 3` end of the transcript will not be captured and should be discarded. This parameter defines the size of a genomic region centered around the 3` alignment position from which to search for polyA rich regions.",
                    "hidden": true,
                    "default": 24
                },
                "max_poly_run": {
                    "type": "integer",
                    "title": "Maximum poly run",
                    "description": "Max poly(A) region allowed with poly_context-sized end regions.",
                    "help_text": "See `poly_context` parameter. This parameter defines the maximum allowed polyA tract within a `poly_context` defined genomic region.",
                    "hidden": true,
                    "default": 8
                },
                "stringtie_opts": {
                    "type": "string",
                    "title": "Stringtie options",
                    "description": "Extra command-line options for stringtie transcript assembly.",
                    "default": "--conservative",
                    "help_text": "For additional String tie options see [here](https://github.com/gpertea/stringtie#stringtie-options)."
                }
            }
        },
        "gene_fusion_detection_options": {
            "title": "Gene Fusion Detection Options",
            "type": "object",
            "description": "Parameters for gene fusion detection",
            "properties": {
                "jaffal_refBase": {
                    "type": "string",
                    "title": "JAFFAL reference genome directory",
                    "format": "directory-path",
                    "description": "JAFFAl reference genome directory.",
                    "help_text": "JAFFAL human hg38 reference data directory can be downloaded from here: https://figshare.com/ndownloader/files/25410494 or see the README for alternative instructions. If custom gemome files are required, see the instructions here: https://github.com/Oshlack/JAFFA/wiki/FAQandTroubleshooting#how-can-i-generate-the-reference-files-for-a-non-supported-genome."
                },
                "jaffal_genome": {
                    "type": "string",
                    "title": "JAFFAL genome reference prefix",
                    "description": "Genome reference prefix. e.g. hg38.",
                    "help_text": "JAFFAL reference files are prefixed with the genome reference file name and need to be supplied . If using the human reference data provided by JAFFAL, this can be left at `hg38`.",
                    "default": "hg38"
                },
                "jaffal_annotation": {
                    "type": "string",
                    "title": "JAFFAL annotation suffix",
                    "description": "Annotation suffix.",
                    "help_text": "JAFFAL reference files are suffixed with the annotation filename and this needs to be supplied. For the human hg38 reference data supplied by JAFFAL, this is `genCode22`.",
                    "default": "genCode22"
                },
                "jaffal_dir": {
                    "type": "string",
                    "format": "directory-path",
                    "description": "Path to the JAFFAL code directory. If running within EPI2ME-Labs, the default path of /home/epi2melabs/JAFFA within the application container will be used. If using outside of EPI2ME-Labs, the path to the code directory downloaded from github should be supplied.",
                    "default": "/home/epi2melabs/JAFFA",
                    "hidden": true
                }
            }
        },
        "differential_expression_options": {
            "title": "Differential Expression Options",
            "type": "object",
            "description": "Options relevant to the differential expression section of the workflow, only need to set if running DE.",
            "default": "",
            "properties": {
                "de_analysis": {
                    "type": "boolean",
                    "default": false,
                    "title": "Differential expression analysis",
                    "description": "Run DE anaylsis",
                    "help_text": "Running this requires you to provide at least two replicates for a control and treated sample as well as a sample sheet param."
                },
                "min_gene_expr": {
                    "type": "integer",
                    "title": "Minimum gene expression",
                    "default": 10,
                    "description": "Minimum gene counts",
                    "help_text": "The minimum number of total mapped sequence reads for a gene to be considered expressed."
                },
                "min_feature_expr": {
                    "type": "integer",
                    "title": "Minimum feature expression",
                    "default": 3,
                    "description": "Minimum transcript counts",
                    "help_text": "The minimum number of total mapped sequence reads for a transcript to be considered."
                },
                "min_samps_gene_expr": {
                    "type": "integer",
                    "title": "Minimum samples with gene expression",
                    "description": "Genes expressed in a minimum of this many samples will be included in the differential expression analysis.",
                    "default": 3,
                    "help_text": "A gene must be mapped to at least this minimum number of samples for the gene be included in the analysis."
                },
                "min_samps_feature_expr": {
                    "type": "integer",
                    "title": "Minimum samples with feature expression",
                    "default": 1,
                    "description": "Transcripts expressed in minimum this many samples",
                    "help_text": "A transcript must be mapped in at least this this minimum number of samples to be included in the analysis."
                }
            }
        },
        "advanced_options": {
            "title": "Advanced Options",
            "type": "object",
            "description": "Advanced options for configuring processes inside the workflow.",
            "properties": {
                "threads": {
                    "type": "integer",
                    "default": 4,
                    "description": "Number of CPU threads.",
                    "help_text": "Only provided to processes including alignment and and assembly that benefit from multiple threads."
                },
                "cdna_kit": {
                    "type": "string",
                    "title": "Kit used for cDNA synthesis.",
                    "enum": [
                        "SQK-PCS109",
                        "SQK-PCS110",
                        "SQK-PCS111",
                        "SQK-LSK114"
                    ],
                    "description": "If cDNA reads are used, select the kit used.",
                    "default": "SQK-PCS109",
                    "help_text": "This will be used by pychopper to preprocess the reads for downstream analysis."
                },
                "pychopper_backend": {
                    "type": "string",
                    "enum": [
                        "edlib",
                        "phmm"
                    ],
                    "title": "Pychopper backend",
                    "description": "Pychopper can use one of two available backends for identifying primers in the raw reads",
                    "default": "edlib",
                    "help_text": "'edlib' is set by default due to its high performance. However, it may be less sensitive than 'phmm'."
                },
                "pychopper_opts": {
                    "type": "string",
                    "title": "Pychopper options",
                    "description": "Extra pychopper opts",
                    "help_text": "See available options (here)[https://github.com/epi2me-labs/pychopper#usage]"
                },
                "bundle_min_reads": {
                    "type": "integer",
                    "default": 50000,
                    "title": "Bundle minimum reads",
                    "description": "Minimum size of bam bundle for parallel processing."
                },
                "isoform_table_nrows": {
                    "type": "integer",
                    "title": "Isoform table number of rows",
                    "description": "Maximum rows to dispay in the isoform report table",
                    "default": 5000
                }
            }
        },
        "miscellaneous_options": {
            "title": "Miscellaneous Options",
            "type": "object",
            "description": "Everything else.",
            "default": "",
            "properties": {
                "help": {
                    "type": "boolean",
                    "default": false,
                    "description": "Display help text.",
                    "fa_icon": "fas fa-question-circle",
                    "hidden": true
                },
                "disable_ping": {
                    "type": "boolean",
                    "default": false,
                    "description": "Enable to prevent sending a workflow ping."
                },
                "version": {
                    "type": "boolean",
                    "default": false,
                    "description": "Display version and exit.",
                    "hidden": true
                }
            }
        }
    },
    "allOf": [
        {
            "$ref": "#/definitions/input_options"
        },
        {
            "$ref": "#/definitions/output_options"
        },
        {
            "$ref": "#/definitions/sample_options"
        },
        {
            "$ref": "#/definitions/options_for_reference_based_workflow"
        },
        {
            "$ref": "#/definitions/gene_fusion_detection_options"
        },
        {
            "$ref": "#/definitions/differential_expression_options"
        },
        {
            "$ref": "#/definitions/advanced_options"
        },
        {
            "$ref": "#/definitions/miscellaneous_options"
        }
    ],
    "properties": {
        "aws_image_prefix": {
            "type": "string",
            "hidden": true
        },
        "aws_queue": {
            "type": "string",
            "hidden": true
        },
        "monochrome_logs": {
            "type": "boolean"
        },
        "validate_params": {
            "type": "boolean",
            "default": true
        },
        "show_hidden_params": {
            "type": "boolean"
        }
    },
    "resources": {
        "recommended": {
            "cpus": 16,
            "memory": "32GB"
        },
        "minimum": {
            "cpus": 8,
            "memory": "16GB"
        },
        "run_time": "15 minutes per sample, with 1 million reads and recommended resources.",
        "arm_support": false
    }
}