#!/usr/bin/env bash
# pyrkit: a tool to archive and co-locate NGS data with hierarchical metadata
set -euo pipefail
VERSION="1.1.0"
# Functions
function err() { cat <<< "$@" 1>&2; }
function fatal() { cat <<< "$@" 1>&2; exit 1; }
function log() { echo -e "[$(date +'%Y-%m-%dT%H:%M:%S%z')]: $*"; }
function abspath() { readlink -e "$1"; }
function parser() {
# Parses command-line args using argparse.bash
# @INPUT "$@" = user command-line arguments
ARGPARSE_DESCRIPTION="A tool to archive and co-locate NGS data with structured metadata"
ARGPARSE_EPILOG="Version $VERSION"
argparse "$@" << EOF || exit 0
# Required arguments group
required = parser.add_argument_group('Required Arguments')
optional = parser.add_argument_group('Optional Arguments')
required.add_argument('-i', '--input-directory', required=True, type=str,
help='Required local directory containing files (raw data and output files) \
to upload into object storage (HPC DME). This path is the absolute output \
directory of a pipeline. Example: -i /data/projects/ccbr123/RNA/')
required.add_argument('-o', '--output-vault', required=True, type=str,
help='Required vault in HPC DME to upload and archive local \
input files and metadata. This vault represents the root HPC DME \
path to archive the data located in --input-directory into object \
storage. CCBR has two main vaults: /CCBR_EXT_Archive and /CCBR_Archive. \
/CCBR_EXT_Archive is for storing any public data such as data or results \
from dbGaP, SRA, GEO, or EBI. /CCBR_Archive is for storing any other data \
such as data from internal (SF) and external sequencing providers \
(NovoGene, GeneDx, Macrogen, Genentech). Example: -o /CCBR_Archive')
required.add_argument('-r', '--request-template', required=True, type=str,
help='Required Project Request Template. The project request template is \
an excel spreadsheet sent out to the requestor to capture information about \
a project or experiment. This excel file is parsed to capture any required \
metadata for instantiating a PI-, Project-, and Sample-level collection hierarchy \
in HPC DME. Example: -r experiment_metadata.xlsx')
required.add_argument('-m', '--multiqc-directory', required=True, type=str,
help='Required MultiQC Output Directory. This directory is created by \
MultiQC. It contains several text files which are generated by MultiQC \
as it builds the report. These files are parsed to \
attach quality-control metadata to each Sample-level collection. \
Example: -m /data/projects/ccbr123/RNA/multiqc_data/')
required.add_argument('-d', '--dme-repo', required=True, type=str,
help='Required Path to an HPC DME command line toolkit installation. This path \
represents a local path to a git installation of DME command line interface. \
The DME command line toolkit must be installed to archive data into a \
vault. If you do not have the toolkit installed, please visit this Getting \
Started page: wiki.nci.nih.gov/display/DMEdoc/Getting+Started \
Example: -d ~/DME/HPC_DME_APIs/')
# Optional arguments group
optional.add_argument('-p', '--project-id', type=str,
help='Optional Project ID. This is a unique identifier or alias tied to \
a request to internally distinguish a project. This could be a CCBR/NCBR/NAS project \
ID. Example: -p ccbr-123')
optional.add_argument('-n', '--dry-run', action = 'store_true', default = 'no',
help='Dry-run the entire pyrkit workflow. If this option is provided \
all the normal steps of the pyrkit workflow will be executed but data \
will NOT be pushed into HPC DME. This is useful for debugging purposes or \
if you are not ready to push everything into HPC DME. Example: --dry-run')
optional.add_argument('-l', '--local-run', action = 'store_true', default = 'no',
help='Local-run allows the user to upload the data locally without submitting \
jobs to Biowulf.')
optional.add_argument('-v', '--validate', action = 'store_true', default = 'no',
help='Include a validation step in the pipeline that checks whether the PI_Lab \
already exists and, if so, whether the Project already exists as well. If the data \
matches entries that are already stored in DME, it will show all of the metadata \
that will be appended to the existing project and which attributes would be modified, if any.')
optional.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS,
help='Display help message and exit')
optional.add_argument('--version', action='version',
version='%(prog)s $VERSION', help="Display version information and exit")
EOF
}
function retry() {
# Tries to run a cmd 5 times before failing
# If a command is successful, it will break out of attempt loop
# Failed attempts are padded using the following exponential
# back-off strategy {4, 16, 64, 256, 1024} in seconds
# @INPUT "$@" = cmd to run
# @CALLS fatal() if command cannot be run in 5 attempts
local n=1
local max=5
local attempt=true # flag for while loop
while $attempt; do
# Attempt command and break if successful
"$@" && attempt=false || {
# Try again up to 5 times
if [[ $n -le $max ]]; then
err "[$(timestamp)] Command failed: $@"
delay=$(( 4**$n ))
err "[$(timestamp)] Attempt: ${n}/${max}. Trying again in ${delay} seconds!"
sleep $delay;
((n++))
else
fatal "Fatal: the command has failed after max attempts!"
fi
}
done
}
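# Illustrative usage (hypothetical; retry() is not wired into the workflow below,
# but accepts any command plus its arguments and re-runs it with a
# 4s/16s/64s/256s/1024s back-off before giving up), e.g.:
#   retry dm_register_directory -s -t 2 upload "/CCBR_Archive"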
function provided() {
# Checks to see if key,value pairs exist
# @INPUT $1 = name of user provided argument
# @INPUT $2 = value of user provided argument
# @CALLS fatal() if value is empty string or NULL
if [[ -z "${2:-}" ]]; then
fatal "Fatal: Failed to provide value to '${1}'!";
fi
}
function clean(){
# Finds the base name of the sample
# @INPUT $1 = file name or path to clean
# @RETURN $bname = cleaned base name (PATH and EXT removed)
local bname=${1:-}
local exts=("_1.fastq" "_2.fastq" ".R1.fastq" ".R2.fastq" "_R1.fastq" "_R2.fastq")
# Remove PATH and .gz extension
bname=$(basename "$bname" | sed 's/\.gz$//')
# Clean remaining mate-info extensions (e.g. _1.fastq, .R1.fastq, _R1.fastq)
for ext in "${exts[@]}"; do
if [[ $bname == *${ext} ]]; then
bname=$(echo "$bname" | sed "s@$ext\$@@")
break # only remove one extension
fi
done
echo "$bname"
}
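# Illustrative example (assumed input name): clean "/data/WT_1.R1.fastq.gz"
# strips the path, the .gz suffix, and the .R1.fastq mate extension, returning "WT_1".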
function require(){
# Requires that an executable is in $PATH; as a last resort, it will attempt to load
# the executable or dependency as a module
# @INPUT $@ = List of dependencies or executables to check
for exe in "${@}"; do
# Check if executable is in $PATH
command -V ${exe} &> /dev/null && continue;
# Try to load exe as lua module
echo module load ${exe}
module load ${exe} &> /dev/null || \
fatal "Failed to find or load '${exe}', not installed on target system."
done
}
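# Example (as invoked in main() below): require git jq python/3.7
# Each dependency is first checked with 'command -V'; anything missing is loaded
# as an environment module, otherwise the script exits fatally.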
function init(){
# Initializes a list of directories
# @INPUT $@ = List of directories
for d in "${@}"; do
mkdir -p "${d}" || fatal "Failed to initialize '${d}' directory";
done
}
function lint(){
# Lints user-provided project request template (experiment_metadata.xlsx)
# Check for basic errors and whether the user provided all the required metadata
# See --request-template or $REQUEST_TEMPLATE in help for more information
# @INPUT $1 = PATH to pyrkit/src/lint.py program
# @INPUT $2 = Project Request Template files to parse and lint (i.e. $REQUEST_TEMPLATE)
# @INPUT $3 = Output Directory
python "${1}" "${2}" "${3}"
}
function _get_sample_groups(){
# Parses sample group information from sample.json (generated by running lint)
# @INPUT $1 = Directory containing sample.json (i.e. the DME output directory written by lint())
# @INPUT $2 = MultiQC Output Directory
# @INPUT $3 = Parsed Sample Group Output file
# Performant approach to checking if glob exists using a Bash built-in function
if compgen -G "${1}/sample.json" > /dev/null; then
echo -e "Sample\tTissueType" > "${2}/${3}" || fatal "Failed to write to MultiQC Directory ${2}"
# Parse Group information from sample.json with jq
jq -r 'to_entries[] | [.value."Sample Name", .value.Group] | @tsv' "${1}/sample.json" \
| sed 's/\tnan$/\tUnknown/g' >> "${2}/${3}"
fi
}
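# Illustrative transformation (sample.json structure assumed from the jq filter above):
#   {"1": {"Sample Name": "WT_1", "Group": "nan"}}
# becomes the following row in sample_group.txt (nan groups are relabeled as Unknown):
#   WT_1<TAB>Unknown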
function parse(){
# Parse additional output files that MultiQC does not automatically parse
# @INPUT $1 = Input Directory or pipeline working directory (i.e. $INPUT_DIRECTORY)
# @INPUT $2 = MultiQC Directory (i.e. $MULTIQC_DIRECTORY)
# Parse Additional metadata from logfiles
_get_sample_groups "${1}/DME" "${2}" "sample_group.txt"
}
function QC(){
# Aggregates MultiQC information across all samples and generates a QC Table
# @INPUT $1 = PATH to pyrkit/src/pyparser.py program
# @INPUT $2 = MultiQC Directory (i.e. $MULTIQC_DIRECTORY)
# Run in sub-shell to keep current working directory
(cd "${2}"; python "${1}" "${2}"/*.txt )
}
function validate(){
# Validate the user-provided metadata
# Check for basic errors and whether the user provided all the required metadata
# See --request-template or $REQUEST_TEMPLATE in help for more information
# @INPUT $1 = PATH to pyrkit/src/validate.py program
# @INPUT $2 = PATH to the directory that will be uploaded to DME
# @INPUT $3 = DME Vault to validate the data against (i.e. /CCBR_Archive or /CCBR_EXT_Archive)
python "${1}" "${2}" "${3}"
}
function fingerprint(){
# Generates a unique identifier for an analysis
# The Analysis ID is deterministic and based on user inputs to the pipeline
# @INPUT $1 = Input Directory or pipeline working directory (i.e. $INPUT_DIRECTORY)
# @INPUT $2 = DME base directory for all intermediate output files
# @RETURNS inputs_md5, analysis_id, assembly_name, gtf_ver
# run_metadata.txt aggregates all important user inputs and pipeline options
local pipeline_version=$(jq .project.version "${1}/config.json" | sed 's/"//g' | sed "s/'//g")
local nsamples=$(jq .project.groups.rsamps[] "${1}/config.json" | sed 's/"//g' | sed "s/'//g" | wc -l)
local runtype=$(jq .project.nends "${1}/config.json" | sed 's/1/single-end/g' | sed 's/2/paired-end/g')
local refjson=$(jq .project.annotation "${1}/config.json" | sed 's/"//g' | sed "s/'//g")
local genomefa=$(jq .references.rnaseq.GENOME "$refjson" | sed 's/"//g' | sed "s/'//g")
local gtf=$(jq .references.rnaseq.GTFFILE "$refjson" | sed 's/"//g' | sed "s/'//g")
local gtf_ver=$(jq .references.rnaseq.ORGANISM "$refjson" | sed 's/"//g' | sed "s/'//g" | awk -F '_' '{print $NF}')
local assembly_name=$(jq .references.rnaseq.ORGANISM "$refjson" | sed 's/"//g' | sed "s/'//g" | awk -F '_' '{print $1}')
local species=$(jq '.Project."Organism"[]' ${2}/project.json | sed 's/"//g' | sed "s/'//g")
#local method=$(jq '.Project."Type of Project"[]' ${2}/project.json | sed 's/"//g' | sed "s/'//g")
local method=$(jq '.Project."Library Strategy"[]' ${2}/project.json | sed 's/"//g' | sed "s/'//g")
echo -e "pipeline_ver\t${pipeline_version}" > "${2}/run_metadata.txt"
echo -e "number_of_cases\t${nsamples}" >> "${2}/run_metadata.txt"
echo -e "runtype\t${runtype}" >> "${2}/run_metadata.txt"
echo -e "method\t${method}" >> "${2}/run_metadata.txt"
echo -e "gtf\t${gtf}" >> "${2}/run_metadata.txt"
echo -e "gtf_ver\tv${gtf_ver/v}" >> "${2}/run_metadata.txt"
echo -e "assembly_name\t${assembly_name}" >> "${2}/run_metadata.txt"
echo -e "genomefa\t${genomefa}" >> "${2}/run_metadata.txt"
echo -e "species\t${species^}" >> "${2}/run_metadata.txt"
for f in "${1}"/*.R?.fastq.gz; do
# Find a list of input FastQ files
echo -e "file\t${f}" >> "${2}/run_metadata.txt"
done
# Convert Input Files to MD5 checksums
while read -r field value; do
# Get MD5 checksum if evaluating an input file
ifile=$(if [[ -f "$value" ]]; then md5sum "$value" | awk '{print $1}'; else echo "$value"; fi)
echo -e "$field\t$ifile"
done < <(grep -v '^gtf_ver\|^assembly_name' "${2}/run_metadata.txt" | sort -k2,2) > "${2}/run_inputs.md5"
# Add MD5 checksum of all inputs and create an analysis id to run metadata
inputs_md5=$(md5sum "${2}/run_inputs.md5" | awk '{print $1}')
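# The short analysis ID below is sliced from the MD5 of all inputs: the first 3
# characters, 2 characters from the midpoint, and the last 4 characters,
# e.g. f63ab9966e22f548934c31172388b750 -> f63-93-b750 (matching the example IDs
# used in the _sym_link_* function comments below).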
analysis_id="${inputs_md5:0:3}-${inputs_md5:${#inputs_md5}/2:2}-${inputs_md5:${#inputs_md5}-4}"
echo -e "md5_all_inputs\t${inputs_md5}" >> "${2}/run_metadata.txt"
echo -e "md5_all_inputs_serial\t${analysis_id}" >> "${2}/run_metadata.txt"
# Get assembly_name and gtf_ver from run inputs
assembly_name=$( (grep '^assembly_name' "${2}/run_metadata.txt" || echo "custom") | awk -F '\t' '{print $NF}')
gtf_ver=$( (grep '^gtf_ver' "${2}/run_metadata.txt" || echo "custom") | awk -F '\t' '{print $NF}')
# Return inputs_md5, analysis_id, assembly_name, gtf_ver
echo -e "${inputs_md5}\t${analysis_id}\t${assembly_name}\t${gtf_ver}"
}
function collections(){
# Generates collection metadata and hierarchy for DME upload
# @INPUT $1 = PATH to pyrkit/src/initialize.py program
# @INPUT $2 = DME base directory for all intermediate output files
# @INPUT $3 = DME Vault to push data (i.e. /CCBR_Archive or /CCBR_EXT_Archive)
# @INPUT $4 = MultiQC Directory (i.e. $MULTIQC_DIRECTORY)
# @INPUT $5 = Project ID (Optional)
# @RETURNS ${analysis_home} or the root of the new DME collection
# Project ID may not be set if it is not provided
local project_id_option
project_id_option=${5:-}
# Creating string for optional cli argument
if [[ -n "${project_id_option}" ]]; then
project_id_option="-p ${project_id_option}"
fi
( # Initializes Collection hierarchy from project metadata template
python "${1}" "${2}" "${2}/upload" "${3#/}" --convert \
-a "${2}/run_metadata.txt" \
-m "${4}/multiqc_matrix.tsv" \
${project_id_option} 1>&2
)
# Return Primary Analysis Collection Name
analysis_home=$(cd "${2}"; find upload/PI_Lab_*/Project_*/ -type d -iname 'Primary_Analysis_*')
echo "${analysis_home}"
}
function _sym_link_fastqs(){
# Symlinks a sample's FastQ files into its mock upload sample collection and
# generates data-object metadata for DME upload
# @INPUT $1 = Input Directory or pipeline working directory (i.e. $INPUT_DIRECTORY)
# @INPUT $2 = DME base directory for all intermediate output files (i.e. "$INPUT_DIRECTORY/DME")
# @INPUT $3 = DME Vault to push data (i.e. /CCBR_Archive or /CCBR_EXT_Archive)
# @INPUT $4 = PATH to pyrkit/src/meta program
for f in "${1}"/*.R?.fastq.gz; do
# Find a FastQ file's mock Sample Collection
sample=$(basename "$f"| sed 's/\.R.\.fastq\.gz//')
rawdir=$(ls --color=never -d "${2}"/upload/PI_Lab_*/Project_*/Sample_*_"${sample}")
ln -s "$f" "$rawdir" || echo "Failed to create symlink for $f and $rawdir";
fname=$(basename "$f"); dmepath=$(echo "$rawdir" | sed "s@^${2}/upload@${3%/}@")
# Generate dataobject metadata for FastQ files
python "${4}" sample -i "$rawdir/$fname" -o "$dmepath" -s "$sample"
done
}
function _sym_link_gbam(){
# Symlinks a sample's genomic BAM files into its mock upload sample collection and
# generates data-object metadata for DME upload
# @INPUT $1 = Input Directory or pipeline working directory (i.e. $INPUT_DIRECTORY)
# @INPUT $2 = DME base directory for all intermediate output files (i.e. "$INPUT_DIRECTORY/DME")
# @INPUT $3 = DME Vault to push data (i.e. /CCBR_Archive or /CCBR_EXT_Archive)
# @INPUT $4 = PATH to pyrkit/src/meta program
# @INPUT $5 = Assembly Name (i.e. mm10)
# @INPUT $6 = GTF Version (i.e. M21)
# @INPUT $7 = Short Analysis ID (i.e. f63-93-b750)
# @INPUT $8 = Long Analysis ID (i.e. f63ab9966e22f548934c31172388b750)
# @INPUT $9 = DME Primary Analysis Collection Path associated with a sample
for f in "${1}"/bams/*.star_rg_added.sorted.dmark.bam; do
# Find a BAM file's mock Sample Collection
sample=$(basename "$f" | sed 's/\.star_rg_added\.sorted\.dmark\.bam$//')
rawdir=$(ls --color=never -d "${2}"/upload/PI_Lab_*/Project_*/Sample_*_"${sample}")
# Create renamed symlinks with assembly, gtf ver and analysis_id
ln -s "$f" "${rawdir}/${sample}.${5}_${6}.Aligned.toGenome.sorted.dmark.${7}.bam" \
|| echo "Failed to create symlink for $f and $rawdir";
dmepath=$(echo "$rawdir" | sed "s@^${2}/upload@${3%/}@")
# Generate data-object metadata for genomic BAM files
python "${4}" sample -i "$rawdir/${sample}.${5}_${6}.Aligned.toGenome.sorted.dmark.${7}.bam" \
-o "$dmepath" -s "$sample" -a "${8}" -d "${9}"
done
}
function _sym_link_tbam(){
# Symlinks a sample's transcriptomic BAM files into its mock upload sample collection and
# generates data-object metadata for DME upload
# @INPUT $1 = Input Directory or pipeline working directory (i.e. $INPUT_DIRECTORY)
# @INPUT $2 = DME base directory for all intermediate output files (i.e. "$INPUT_DIRECTORY/DME")
# @INPUT $3 = DME Vault to push data (i.e. /CCBR_Archive or /CCBR_EXT_Archive)
# @INPUT $4 = PATH to pyrkit/src/meta program
# @INPUT $5 = Assembly Name (i.e. mm10)
# @INPUT $6 = GTF Version (i.e. M21)
# @INPUT $7 = Short Analysis ID (i.e. f63-93-b750)
# @INPUT $8 = Long Analysis ID (i.e. f63ab9966e22f548934c31172388b750)
# @INPUT $9 = DME Primary Analysis Collection Path associated with a sample
for f in "${1}"/bams/*.p2.Aligned.toTranscriptome.out.bam; do
# Find a BAM file's mock Sample Collection
sample=$(basename "$f" | sed 's/\.p2\.Aligned\.toTranscriptome\.out\.bam$//')
if [[ "$sample" == '*' ]]; then
continue
fi
rawdir=$(ls --color=never -d "${2}"/upload/PI_Lab_*/Project_*/Sample_*_"${sample}")
# Create renamed symlinks with assembly, gtf ver and analysis_id
ln -s "$f" "${rawdir}/${sample}.${5}_${6}.Aligned.toTranscriptome.${7}.bam" \
|| echo "Failed to create symlink for $f and $rawdir";
dmepath=$(echo "$rawdir" | sed "s@^${2}/upload@${3%/}@")
# Generate data-object metadata for transcriptomic BAM files
python "${4}" sample -i "$rawdir/${sample}.${5}_${6}.Aligned.toTranscriptome.${7}.bam" \
-o "$dmepath" -s "$sample" -a "${8}" -d "${9}"
done
}
function _sym_link_cbam(){
# Symlinks a sample's chimeric BAM files into its mock upload sample collection and
# generates data-object metadata for DME upload
# @INPUT $1 = Input Directory or pipeline working directory (i.e. $INPUT_DIRECTORY)
# @INPUT $2 = DME base directory for all intermediate output files (i.e. "$INPUT_DIRECTORY/DME")
# @INPUT $3 = DME Vault to push data (i.e. /CCBR_Archive or /CCBR_EXT_Archive)
# @INPUT $4 = PATH to pyrkit/src/meta program
# @INPUT $5 = Assembly Name (i.e. mm10)
# @INPUT $6 = GTF Version (i.e. M21)
# @INPUT $7 = Short Analysis ID (i.e. f63-93-b750)
# @INPUT $8 = Long Analysis ID (i.e. f63ab9966e22f548934c31172388b750)
# @INPUT $9 = DME Primary Analysis Collection Path associated with a sample
for f in "${1}"/fusions/*.p2.arriba.Aligned.sortedByCoord.out.bam; do
# Find a BAM file's mock Sample Collection
sample=$(basename "$f" | sed 's/\.p2\.arriba\.Aligned\.sortedByCoord\.out\.bam$//')
rawdir=$(ls --color=never -d "${2}"/upload/PI_Lab_*/Project_*/Sample_*_"${sample}")
# Create renamed symlinks with assembly, gtf ver and analysis_id
ln -s "$f" "${rawdir}/${sample}.${5}_${6}.Aligned.toChimeric.${7}.bam" \
|| echo "Failed to create symlink for $f and $rawdir";
dmepath=$(echo "$rawdir" | sed "s@^${2}/upload@${3%/}@")
# Generate data-object metadata for chimeric BAM files
python "${4}" sample -i "$rawdir/${sample}.${5}_${6}.Aligned.toChimeric.${7}.bam" \
-o "$dmepath" -s "$sample" -a "${8}" -d "${9}"
done
}
function _sym_link_arriba_fusions(){
# Symlinks a sample's Arriba predicted gene fusions into its mock upload sample collection and
# generates data-object metadata for DME upload
# @INPUT $1 = Input Directory or pipeline working directory (i.e. $INPUT_DIRECTORY)
# @INPUT $2 = DME base directory for all intermediate output files (i.e. "$INPUT_DIRECTORY/DME")
# @INPUT $3 = DME Vault to push data (i.e. /CCBR_Archive or /CCBR_EXT_Archive)
# @INPUT $4 = PATH to pyrkit/src/meta program
# @INPUT $5 = Assembly Name (i.e. mm10)
# @INPUT $6 = GTF Version (i.e. M21)
# @INPUT $7 = Short Analysis ID (i.e. f63-93-b750)
# @INPUT $8 = Long Analysis ID (i.e. f63ab9966e22f548934c31172388b750)
# @INPUT $9 = DME Primary Analysis Collection Path associated with a sample
for f in "${1}"/fusions/*_fusions.tsv; do
# Find a fusion file's mock Sample Collection
sample=$(basename "$f" | sed 's/_fusions\.tsv$//')
rawdir=$(ls --color=never -d "${2}"/upload/PI_Lab_*/Project_*/Sample_*_"${sample}")
# Create renamed symlinks with assembly, gtf ver and analysis_id
ln -s "$f" "${rawdir}/${sample}.${5}_${6}.arriba.fusions.${7}.tsv" \
|| echo "Failed to create symlink for $f and $rawdir";
dmepath=$(echo "$rawdir" | sed "s@^${2}/upload@${3%/}@")
# Generate data-object metadata for Arriba fusion TSV files
python "${4}" sample -i "${rawdir}/${sample}.${5}_${6}.arriba.fusions.${7}.tsv" \
-o "$dmepath" -s "$sample" -a "${8}" -d "${9}"
done
}
function _sym_link_arriba_pdfs(){
# Symlinks a sample's Arriba fusion plots (PDF) into its mock upload sample collection and
# generates data-object metadata for DME upload
# @INPUT $1 = Input Directory or pipeline working directory (i.e. $INPUT_DIRECTORY)
# @INPUT $2 = DME base directory for all intermediate output files (i.e. "$INPUT_DIRECTORY/DME")
# @INPUT $3 = DME Vault to push data (i.e. /CCBR_Archive or /CCBR_EXT_Archive)
# @INPUT $4 = PATH to pyrkit/src/meta program
# @INPUT $5 = Assembly Name (i.e. mm10)
# @INPUT $6 = GTF Version (i.e. M21)
# @INPUT $7 = Short Analysis ID (i.e. f63-93-b750)
# @INPUT $8 = Long Analysis ID (i.e. f63ab9966e22f548934c31172388b750)
# @INPUT $9 = DME Primary Analysis Collection Path associated with a sample
for f in "${1}"/fusions/*_fusions.arriba.pdf; do
# Find a PDF file's mock Sample Collection
sample=$(basename "$f" | sed 's/_fusions\.arriba\.pdf$//')
rawdir=$(ls --color=never -d "${2}"/upload/PI_Lab_*/Project_*/Sample_*_"${sample}")
# Create renamed symlinks with assembly, gtf ver and analysis_id
ln -s "$f" "${rawdir}/${sample}.${5}_${6}.arriba.fusions.${7}.pdf" \
|| echo "Failed to create symlink for $f and $rawdir";
dmepath=$(echo "$rawdir" | sed "s@^${2}/upload@${3%/}@")
# Generate data-object metadata for Arriba fusion PDF files
python "${4}" sample -i "${rawdir}/${sample}.${5}_${6}.arriba.fusions.${7}.pdf" \
-o "$dmepath" -s "$sample" -a "${8}" -d "${9}"
done
}
function links(){
# Creates symlinks for files to upload into DME
# @INPUT $1 = Input Directory or pipeline working directory (i.e. $INPUT_DIRECTORY)
# @INPUT $2 = DME base directory for all intermediate output files (i.e. "$INPUT_DIRECTORY/DME")
# @INPUT $3 = DME Vault to push data (i.e. /CCBR_Archive or /CCBR_EXT_Archive)
# @INPUT $4 = PATH to pyrkit/src/meta program
# @INPUT $5 = Assembly Name (i.e. mm10)
# @INPUT $6 = GTF Version (i.e. M21)
# @INPUT $7 = Short Analysis ID (i.e. f63-93-b750)
# @INPUT $8 = Long Analysis ID (i.e. 26071405f2f1c3a6f71d4141edb208e2)
# @INPUT $9 = DME Primary Analysis Collection Path associated with a sample
_sym_link_fastqs "${1}" "${2}" "${3}" "${4}"
_sym_link_gbam "${1}" "${2}" "${3}" "${4}" "${5}" "${6}" "${7}" "${8}" "${9}"
_sym_link_tbam "${1}" "${2}" "${3}" "${4}" "${5}" "${6}" "${7}" "${8}" "${9}"
_sym_link_cbam "${1}" "${2}" "${3}" "${4}" "${5}" "${6}" "${7}" "${8}" "${9}"
_sym_link_arriba_fusions "${1}" "${2}" "${3}" "${4}" "${5}" "${6}" "${7}" "${8}" "${9}"
_sym_link_arriba_pdfs "${1}" "${2}" "${3}" "${4}" "${5}" "${6}" "${7}" "${8}" "${9}"
}
function fix(){
# Fixes formatting of counts matrices to be compatible with downstream analysis
# @INPUT List of count matrices to reformat
for f in "${@}"; do
paste <(awk -F '\t' -v OFS='\t' '{print $1"|"$2}' "$f" | sed '1 s/gene_id|GeneName/symbol/') \
<(cut -f3- "$f") > "$f.tmp"
( # Remove suffix expected_counts from each sample name
head -1 "$f.tmp" \
| tr '\t' '\n' \
| sed 's/_expected_count$//g' \
| tr '\n' '\t' \
| sed 's/.$/\n/';
tail -n+2 "$f.tmp"
) > "$f.tmp2"
mv "$f.tmp2" "$f"; rm "$f.tmp"
done
}
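# Illustrative before/after of a header row (hypothetical sample names; column
# layout assumed from the awk/sed patterns above):
#   gene_id<TAB>GeneName<TAB>KO_1_expected_count<TAB>KO_2_expected_count
# becomes
#   symbol<TAB>KO_1<TAB>KO_2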
function multi(){
# Prepares multi-sample results or files for upload into Primary Analysis collection
# @INPUT $1 = Input Directory or pipeline working directory (i.e. $INPUT_DIRECTORY)
# @INPUT $2 = Path to Primary Analysis Collection directory on local filesystem
# @INPUT $3 = MultiQC Directory (i.e. $MULTIQC_DIRECTORY)
# @INPUT $4 = Project Request Template (i.e. $REQUEST_TEMPLATE)
# @INPUT $5 = PATH to pyrkit/src/meta program
# @INPUT $6 = DME Primary Analysis Collection Path
# @INPUT $7 = Long Analysis ID (i.e. f63ab9966e22f548934c31172388b750)
# Add Counts Matrices (Gene and Isoform Counts), TIN counts, MultiQC Report and TSV, Project Request Spreadsheet
cp "${1}/DEG_ALL/RSEM.genes.expected_count.all_samples.txt" "${2}/RSEM_genes_expected_counts.tsv"
cp "${1}/DEG_ALL/RSEM.isoforms.expected_count.all_samples.txt" "${2}/RSEM_isoforms_expected_counts.tsv"
cp "${1}/DEG_ALL/RSEM.genes.FPKM.all_samples.txt" "${2}/RSEM_genes_FPKM_normalized.tsv"
cp "${1}/DEG_ALL/RSEM.isoforms.FPKM.all_samples.txt" "${2}/RSEM_isoforms_FPKM_normalized.tsv"
cp "${1}/DEG_ALL/RSEM.genes.TPM.all_samples.txt" "${2}/RSEM_genes_TPM_normalized.tsv"
cp "${1}/DEG_ALL/RSEM.isoforms.TPM.all_samples.txt" "${2}/RSEM_isoforms_TPM_normalized.tsv"
# Reformat for downstream analysis and remove suffix expected_counts
fix "${2}/RSEM_genes_expected_counts.tsv" "${2}/RSEM_isoforms_expected_counts.tsv" \
"${2}/RSEM_genes_FPKM_normalized.tsv" "${2}/RSEM_isoforms_FPKM_normalized.tsv" \
"${2}/RSEM_genes_TPM_normalized.tsv" "${2}/RSEM_isoforms_TPM_normalized.tsv"
# Symlink remaining files to Primary Analysis collection
mqc_prefix=$(basename "${1}")
for f in "${3}/multiqc_matrix.tsv" "${1}/Reports/multiqc_report.html" "${1}/Reports/RNA_Report.html" "${4}"; do
ln -s "${f}" "${2}/" || echo "Failed to create symlink for $f and ${2}";
done
# Generate dataobject metadata files for aggregate or multi-sample data
while read -r f; do
python "${5}" combined -i "$f" -o "${6}" -a "${7}"
done < <(find "${2}" -not -type d -not -iname '*.metadata.json')
}
function dryrun(){
# Dry-runs the dm_register_directory command for pushing local data to HPC DME
# @INPUT $1 = DME base directory for all intermediate output files (i.e. $INPUT_DIRECTORY/DME)
# @INPUT $2 = Path to local git installation of the DME CLI toolkit
# @INPUT $3 = DME Vault to push data (i.e. /CCBR_Archive or /CCBR_EXT_Archive)
( # Goto DME base directory for all intermediate output files
cd "${1}"
# Dry-run registration of the upload directory against the target vault
dm_register_directory -d -s -t 2 -e <(echo '**.metadata.json') upload "${3}"
echo "Exit status of dryrun: $?"
)
}
function main(){
# Parses cli args using argparse.bash and input files to extract metadata for upload
# Initializes upload directory representing DME hierarchy
# Generates collection and dataobject metadata based on inputs
# @INPUT "$@" = command-line arguments
# @CALLS require(), parser(), init(), lint(), parse(), QC(), fingerprint()
# @EXPORTED ARGPARSE VARIABLES:
# $INPUT_DIRECTORY = Input Directory
# $OUTPUT_VAULT = Output DME Vault
# $REQUEST_TEMPLATE = Request Template
# $MULTIQC_DIRECTORY = MultiQC HOME
# $PROJECT_ID = Project ID
# $DRY_RUN = Dry run workflow
# $LOCAL_RUN = Upload data locally
# $DME_REPO = Path to DME git install
# Check system dependencies are installed
require git jq python/3.7
# Check for version flag
case ${1:-} in
--version) echo "$(basename "$0") $VERSION" && exit 0;;
esac
# pyrkit home directory or installation location
repohome=$(abspath "$(dirname "$0")")
# Enable argparse parsing within bash
source "$(dirname "$0")/src/argparse.bash" || \
fatal "Fatal: Failed to locate argparse.bash in ${repohome}!"
# Parse command-line arguments with argparse
if [ $# -eq 0 ]; then parser -h; fi # Display usage, user did not provide any args
parser "${@}"
# Base directory for all intermediate output files
output="${INPUT_DIRECTORY%/}/DME"
# Set Defaults for Optional arguments
PROJECT_ID="${PROJECT_ID:-}"
# Check that user has DME CLI toolkit installed
export HPC_DM_UTILS="${DME_REPO%/}/utils"
source "${HPC_DM_UTILS}/functions"
require "dm_register_directory"
# Initialize output directory, lint template, parse logs, and aggregate QC information
init "${output}"
lint "${repohome}/src/lint.py" "${REQUEST_TEMPLATE}" "${output}"
parse "${INPUT_DIRECTORY%/}" "${MULTIQC_DIRECTORY%/}"
QC "${repohome}/src/pyparser.py" "${MULTIQC_DIRECTORY%/}"
# Generate unique and deterministic Analysis ID based on User Inputs
local inputs_md5 analysis_id
local assembly_name gtf_ver
IFS=$'\t' read -r inputs_md5 analysis_id assembly_name gtf_ver < <(fingerprint "${INPUT_DIRECTORY%/}" "${output}")
# Initializes local filesystem mock DME hierarchy
# PI-, Project-, Analysis-, Sample-level directories are created along with metadata
analysis_home=$(collections "${repohome}/src/initialize.py" "${output}" "${OUTPUT_VAULT%/}" "${MULTIQC_DIRECTORY%/}" "${PROJECT_ID}")
dme_analysis_home=$(echo "$analysis_home" | sed "s@^upload@${OUTPUT_VAULT%/}@")
# Creates symlinks for sample-level collections in DME
links "${INPUT_DIRECTORY%/}" "${output}" "${OUTPUT_VAULT%/}" "${repohome}/src/meta" \
"${assembly_name}" "${gtf_ver}" "${analysis_id}" "${inputs_md5}" "${dme_analysis_home}"
# Prepares multi-sample results or files for upload into Primary Analysis collection
multi "${INPUT_DIRECTORY%/}" "${output}/${analysis_home}" "${MULTIQC_DIRECTORY%/}" "${REQUEST_TEMPLATE}" \
"${repohome}/src/meta" "$dme_analysis_home" "${inputs_md5}"
# Validate whether the collections already exist in DME and, if so, report the metadata updates that would take place
if [ "$VALIDATE" = "yes" ]; then
validate "${repohome}/src/validate.py" "${output}/upload" "/${OUTPUT_VAULT#/}"
fi
# Dry-run dm_register_directory command
dryrun "${output}" "${DME_REPO%/}" "/${OUTPUT_VAULT#/}"
# Push to HPC DME if --dry-run option NOT provided
if [ "$DRY_RUN" = "no" ]; then
if [ "$LOCAL_RUN" = "yes" ]; then
cd "${output}"
echo "Uploading data locally to DME"
echo "$PWD"
dm_register_directory -s -t 2 -e <(echo '**.metadata.json') upload "/${OUTPUT_VAULT#/}"
else
jobid=$(sbatch -J "pyrkit" --mem=24g --cpus-per-task=4 --time=24:00:00 \
"${repohome}/src/submit.sh" "${output}" "${DME_REPO%/}" "/${OUTPUT_VAULT#/}")
echo "Submiting Job ${jobid} to push data into DME"
fi
fi
}
# Main: check usage, parse args, extract metadata from inputs and generate upload hierarchy
main "$@"