forked from epi2me-labs/wf-transcriptomes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
nextflow_schema.json
396 lines (396 loc) · 20.7 KB
/
nextflow_schema.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json",
"title": "epi2me-labs/wf-transcriptomes",
"workflow_title": "Workflow Transcriptomes",
"description": "Transcriptome analysis including assembly and annotation of cDNA and direct RNA sequencing data, gene fusions and differential expression.",
"demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-transcriptomes/wf-transcriptomes-demo.tar.gz",
"aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-transcriptomes/wf-transcriptomes-demo/aws.nextflow.config",
"url": "https://github.com/epi2me-labs/wf-transcriptomes",
"type": "object",
"definitions": {
"input_options": {
"title": "Input Options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Parameters for finding and handling input data for analysis.",
"properties": {
"fastq": {
"type": "string",
"format": "path",
"title": "FASTQ",
"demo_data": "${projectDir}/test_data/fastq",
"description": "FASTQ files to use in the analysis.",
"help_text": "This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`."
},
"transcriptome_source": {
"type": "string",
"enum": [
"precomputed",
"reference-guided"
],
"default": "reference-guided",
"description": "Select how the transcriptome used for analysis should be prepared.",
"help_text": "To analyse only gene fusions and differential expression, use of an existing transcriptome may be preferred, and so 'precomputed' should be selected. In this case the 'ref_transcriptome' parameter should be specified. To create a reference transcriptome using an existing reference genome, select 'reference-guided' and specify the 'ref_genome' parameter."
},
"ref_genome": {
"type": "string",
"title": "Reference genome",
"format": "file-path",
"demo_data": "${projectDir}/test_data/SIRV_150601a.fasta",
"description": "Path to reference genome sequence [.fa/.fq/.fa.gz/.fq.gz]. Required for reference-based workflow.",
"help_text": "A reference genome is required for reference-based assembly of a transcriptome."
},
"ref_transcriptome": {
"type": "string",
"title": "Reference transcriptome",
"format": "file-path",
"description": "Transcriptome reference file. Required for precomputed transcriptome calculation and for differential expression analysis.",
"help_text": "A reference transcriptome related to the sample under study. Must be supplied when the 'Transcriptome source' parameter has been set to 'precomputed' or to perform differential expression."
},
"ref_annotation": {
"type": "string",
"title": "Reference annotation",
"format": "file-path",
"demo_data": "${projectDir}/test_data/SIRV_isoforms.gtf",
"description": "A reference annotation in GFF2 or GFF3 format (extensions .gtf(.gz), .gff(.gz), .gff3(.gz)). Only annotation files from [Encode](https://www.encodeproject.org), [Ensembl](https://www.ensembl.org/index.html) and [NCBI](https://www.ncbi.nlm.nih.gov/) are supported.",
"help_text": "This will be used for guiding the transcriptome assembly and to label transcripts with their corresponding gene identifiers."
},
"direct_rna": {
"type": "boolean",
"default": false,
"title": "direct RNA",
"description": "Set to true for direct RNA sequencing.",
"help_text": "Omits the pychopper step."
},
"analyse_unclassified": {
"type": "boolean",
"default": false,
"title": "Analyse unclassified",
"description": "Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory.",
"help_text": "If selected and if the input is a multiplex directory the workflow will also process the unclassified directory."
}
},
"allOf": [
{
"required": [
"fastq",
"transcriptome_source"
]
}]
},
"output_options": {
"title": "Output Options",
"type": "object",
"description": "Parameters for saving and naming workflow outputs.",
"default": "",
"properties": {
"out_dir": {
"type": "string",
"format": "directory-path",
"default": "output",
"description": "Directory for output of all user-facing files."
}
}
},
"sample_options": {
"title": "Sample Options",
"type": "object",
"description": "Parameters that relate to samples such as sample sheets and sample names.",
"default": "",
"properties": {
"sample_sheet": {
"type": "string",
"title": "Sample and condition sheet",
"format": "file-path",
"description": "A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. If you are running the differential expression workflow, there must be an additional column `condition` with two labels, one of which must be `control` (e.g. `control` and `treated`). Control will indicate which samples will be used as the reference. There should be at least 3 repeats for each condition.",
"help_text": "The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed."
},
"sample": {
"type": "string",
"description": "A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files."
}
}
},
"options_for_reference_based_workflow": {
"title": "Options for reference-based workflow",
"type": "object",
"description": "Parameters that are used solely for the reference-guided workflow",
"properties": {
"plot_gffcmp_stats": {
"type": "boolean",
"default": true,
"title": "Plot gffcompare statistics",
"description": "Create a PDF of plots showing gffcompare results",
"help_text": "If set to true, a PDF file containing detailed gffcompare results will be output"
},
"gffcompare_opts": {
"type": "string",
"title": "Gffcompare options",
"description": "Extra command-line options to give to gffcompare -r",
"default": "-R",
"help_text": "For a list of possible options see [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml)."
},
"minimap2_index_opts": {
"type": "string",
"title": "Minimap2 index options",
"description": "Extra command-line options for minimap2 indexing.",
"default": "-k14",
"help_text": "See [minimap2 index options](https://lh3.github.io/minimap2/minimap2.html#4) for more information. These will only be relevant in the reference based transcriptome assembly."
},
"minimap2_opts": {
"type": "string",
"title": "Minimap2 options",
"description": "Additional command-line options for minimap2 alignment.",
"default": "-uf",
"help_text": "See [minimap2 options](https://lh3.github.io/minimap2/minimap2.html#5) for further information. These will only be relevant in the reference based transcriptome assembly."
},
"minimum_mapping_quality": {
"type": "integer",
"description": "Filter aligned reads by MAPQ quality.",
"default": 40,
"help_text": "Reads that do not meet this mapping quality after minimap2 alignment will be filtered out."
},
"poly_context": {
"type": "integer",
"description": "Region size at end of reads to apply poly(A) filter.",
"help_text": "Mispriming of polyT primers can occur when, instead of priming transcription from a polyA tail, it is primed from internal polyA rich regions in the genome. In these cases the 3' end of the transcript will not be captured and should be discarded. This parameter defines the size of a genomic region centered around the 3' alignment position from which to search for polyA rich regions.",
"hidden": true,
"default": 24
},
"max_poly_run": {
"type": "integer",
"title": "Maximum poly run",
"description": "Max poly(A) region allowed within poly_context-sized end regions.",
"help_text": "See `poly_context` parameter. This parameter defines the maximum allowed polyA tract within a `poly_context` defined genomic region.",
"hidden": true,
"default": 8
},
"stringtie_opts": {
"type": "string",
"title": "Stringtie options",
"description": "Extra command-line options for stringtie transcript assembly.",
"default": "--conservative",
"help_text": "For additional StringTie options see [here](https://github.com/gpertea/stringtie#stringtie-options)."
}
}
},
"gene_fusion_detection_options": {
"title": "Gene Fusion Detection Options",
"type": "object",
"description": "Parameters for gene fusion detection",
"properties": {
"jaffal_refBase": {
"type": "string",
"title": "JAFFAL reference genome directory",
"format": "directory-path",
"description": "JAFFAL reference genome directory.",
"help_text": "JAFFAL human hg38 reference data directory can be downloaded from here: https://figshare.com/ndownloader/files/25410494 or see the README for alternative instructions. If custom genome files are required, see the instructions here: https://github.com/Oshlack/JAFFA/wiki/FAQandTroubleshooting#how-can-i-generate-the-reference-files-for-a-non-supported-genome."
},
"jaffal_genome": {
"type": "string",
"title": "JAFFAL genome reference prefix",
"description": "Genome reference prefix. e.g. hg38.",
"help_text": "JAFFAL reference files are prefixed with the genome reference file name and this needs to be supplied. If using the human reference data provided by JAFFAL, this can be left at `hg38`.",
"default": "hg38"
},
"jaffal_annotation": {
"type": "string",
"title": "JAFFAL annotation suffix",
"description": "Annotation suffix.",
"help_text": "JAFFAL reference files are suffixed with the annotation filename and this needs to be supplied. For the human hg38 reference data supplied by JAFFAL, this is `genCode22`.",
"default": "genCode22"
},
"jaffal_dir": {
"type": "string",
"format": "directory-path",
"description": "Path to the JAFFAL code directory. If running within EPI2ME-Labs, the default path of /home/epi2melabs/JAFFA within the application container will be used. If using outside of EPI2ME-Labs, the path to the code directory downloaded from github should be supplied.",
"default": "/home/epi2melabs/JAFFA",
"hidden": true
}
}
},
"differential_expression_options": {
"title": "Differential Expression Options",
"type": "object",
"description": "Options relevant to the differential expression section of the workflow, only need to set if running DE.",
"default": "",
"properties": {
"de_analysis": {
"type": "boolean",
"default": false,
"title": "Differential expression analysis",
"description": "Run DE analysis",
"help_text": "Running this requires you to provide at least two replicates for a control and treated sample as well as a sample sheet param."
},
"min_gene_expr": {
"type": "integer",
"title": "Minimum gene expression",
"default": 10,
"description": "Minimum gene counts",
"help_text": "The minimum number of total mapped sequence reads for a gene to be considered expressed."
},
"min_feature_expr": {
"type": "integer",
"title": "Minimum feature expression",
"default": 3,
"description": "Minimum transcript counts",
"help_text": "The minimum number of total mapped sequence reads for a transcript to be considered."
},
"min_samps_gene_expr": {
"type": "integer",
"title": "Minimum samples with gene expression",
"description": "Genes expressed in a minimum of this many samples will be included in the differential expression analysis.",
"default": 3,
"help_text": "A gene must be mapped to at least this minimum number of samples for the gene to be included in the analysis."
},
"min_samps_feature_expr": {
"type": "integer",
"title": "Minimum samples with feature expression",
"default": 1,
"description": "Transcripts expressed in a minimum of this many samples will be included in the differential expression analysis.",
"help_text": "A transcript must be mapped in at least this minimum number of samples to be included in the analysis."
}
}
},
"advanced_options": {
"title": "Advanced Options",
"type": "object",
"description": "Advanced options for configuring processes inside the workflow.",
"properties": {
"threads": {
"type": "integer",
"default": 4,
"description": "Number of CPU threads.",
"help_text": "Only provided to processes including alignment and assembly that benefit from multiple threads."
},
"cdna_kit": {
"type": "string",
"title": "Kit used for cDNA synthesis.",
"enum": [
"SQK-PCS109",
"SQK-PCS110",
"SQK-PCS111",
"SQK-LSK114"
],
"description": "If cDNA reads are used, select the kit used.",
"default": "SQK-PCS109",
"help_text": "This will be used by pychopper to preprocess the reads for downstream analysis."
},
"pychopper_backend": {
"type": "string",
"enum": [
"edlib",
"phmm"
],
"title": "Pychopper backend",
"description": "Pychopper can use one of two available backends for identifying primers in the raw reads",
"default": "edlib",
"help_text": "'edlib' is set by default due to its high performance. However, it may be less sensitive than 'phmm'."
},
"pychopper_opts": {
"type": "string",
"title": "Pychopper options",
"description": "Extra pychopper opts",
"help_text": "See available options [here](https://github.com/epi2me-labs/pychopper#usage)"
},
"bundle_min_reads": {
"type": "integer",
"default": 50000,
"title": "Bundle minimum reads",
"description": "Minimum size of bam bundle for parallel processing."
},
"isoform_table_nrows": {
"type": "integer",
"title": "Isoform table number of rows",
"description": "Maximum rows to display in the isoform report table",
"default": 5000
}
}
},
"miscellaneous_options": {
"title": "Miscellaneous Options",
"type": "object",
"description": "Everything else.",
"default": "",
"properties": {
"help": {
"type": "boolean",
"default": false,
"description": "Display help text.",
"fa_icon": "fas fa-question-circle",
"hidden": true
},
"disable_ping": {
"type": "boolean",
"default": false,
"description": "Enable to prevent sending a workflow ping."
},
"version": {
"type": "boolean",
"default": false,
"description": "Display version and exit.",
"hidden": true
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/input_options"
},
{
"$ref": "#/definitions/output_options"
},
{
"$ref": "#/definitions/sample_options"
},
{
"$ref": "#/definitions/options_for_reference_based_workflow"
},
{
"$ref": "#/definitions/gene_fusion_detection_options"
},
{
"$ref": "#/definitions/differential_expression_options"
},
{
"$ref": "#/definitions/advanced_options"
},
{
"$ref": "#/definitions/miscellaneous_options"
}
],
"properties": {
"aws_image_prefix": {
"type": "string",
"hidden": true
},
"aws_queue": {
"type": "string",
"hidden": true
},
"monochrome_logs": {
"type": "boolean"
},
"validate_params": {
"type": "boolean",
"default": true
},
"show_hidden_params": {
"type": "boolean"
}
},
"resources": {
"recommended": {
"cpus": 16,
"memory": "32GB"
},
"minimum": {
"cpus": 8,
"memory": "16GB"
},
"run_time": "15 minutes per sample, with 1 million reads and recommended resources.",
"arm_support": false
}
}