Skip to content

Commit

Permalink
gCNV WDLs for WGS
Browse files Browse the repository at this point in the history
  • Loading branch information
mwalker174 committed May 15, 2020
1 parent 88cb338 commit f146f66
Show file tree
Hide file tree
Showing 4 changed files with 292 additions and 122 deletions.
246 changes: 188 additions & 58 deletions scripts/cnv_wdl/cnv_common_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ task CollectCounts {
File ref_fasta
File ref_fasta_fai
File ref_fasta_dict
Array[String]? disabled_read_filters
Boolean? enable_indexing
String? format
File? gatk4_jar_override
Expand All @@ -201,10 +202,27 @@ task CollectCounts {
Int? preemptible_attempts
}

parameter_meta {
bam: {
localization_optional: true
}
bam_idx: {
localization_optional: true
}
}

Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
Int command_mem_mb = machine_mem_mb - 1000

Boolean enable_indexing_ = select_first([enable_indexing, false])
Array[String] disabled_read_filters_arr = if(defined(disabled_read_filters))
then
prefix(
"--disable-read-filter ",
select_first([disabled_read_filters])
)
else
[]

# Sample name is derived from the bam filename
String base_filename = basename(bam, ".bam")
Expand Down Expand Up @@ -257,7 +275,8 @@ task CollectCounts {
--reference ~{ref_fasta} \
--format ~{default="HDF5" hdf5_or_tsv_or_null_format} \
--interval-merging-rule OVERLAPPING_ONLY \
--output ~{counts_filename_for_collect_read_counts}
--output ~{counts_filename_for_collect_read_counts} \
~{sep=' ' disabled_read_filters_arr}

if [ ~{do_block_compression} = "true" ]; then
bgzip ~{counts_filename_for_collect_read_counts}
Expand Down Expand Up @@ -303,6 +322,15 @@ task CollectAllelicCounts {
Int? preemptible_attempts
}

parameter_meta {
bam: {
localization_optional: true
}
bam_idx: {
localization_optional: true
}
}

Int machine_mem_mb = select_first([mem_gb, 13]) * 1000
Int command_mem_mb = machine_mem_mb - 1000

Expand Down Expand Up @@ -413,99 +441,82 @@ task ScatterIntervals {
}
}

task PostprocessGermlineCNVCalls {
task BundledPostprocessGermlineCNVCalls {
input {
String entity_id
Array[File] gcnv_calls_tars
Array[File] gcnv_model_tars
Array[File] calling_configs
Array[File] denoising_configs
Array[File] gcnvkernel_version
Array[File] sharded_interval_lists
File contig_ploidy_calls_tar
Array[String]? allosomal_contigs
Int ref_copy_number_autosomal_contigs
Int sample_index
File? gatk4_jar_override

# Runtime parameters
String gatk_docker
Int? mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? cpu
Int? preemptible_attempts
File invariants_tar
String entity_id
File contig_ploidy_calls_tar
Array[String]? allosomal_contigs
Int ref_copy_number_autosomal_contigs
Int sample_index
File? gatk4_jar_override

# Runtime parameters
String gatk_docker
Int? mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? cpu
Int? preemptible_attempts
}

Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
Int command_mem_mb = machine_mem_mb - 1000

Float invariants_size = size(invariants_tar, "GiB")
Float disk_overhead = 20.0
Float tar_disk_factor= 5.0
Int vm_disk_size = ceil(tar_disk_factor * invariants_size + disk_overhead)

String genotyped_intervals_vcf_filename = "genotyped-intervals-~{entity_id}.vcf.gz"
String genotyped_segments_vcf_filename = "genotyped-segments-~{entity_id}.vcf.gz"
String denoised_copy_ratios_filename = "denoised_copy_ratios-~{entity_id}.tsv"

Array[String] allosomal_contigs_args = if defined(allosomal_contigs) then prefix("--allosomal-contig ", select_first([allosomal_contigs])) else []

command <<<
set -eu
export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}
set -euo pipefail

sharded_interval_lists_array=(~{sep=" " sharded_interval_lists})
export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}

# untar calls to CALLS_0, CALLS_1, etc directories and build the command line
# also copy over shard config and interval files
gcnv_calls_tar_array=(~{sep=" " gcnv_calls_tars})
calling_configs_array=(~{sep=" " calling_configs})
denoising_configs_array=(~{sep=" " denoising_configs})
gcnvkernel_version_array=(~{sep=" " gcnvkernel_version})
sharded_interval_lists_array=(~{sep=" " sharded_interval_lists})
calls_args=""
for index in ${!gcnv_calls_tar_array[@]}; do
gcnv_calls_tar=${gcnv_calls_tar_array[$index]}
mkdir -p CALLS_$index/SAMPLE_~{sample_index}
tar xzf $gcnv_calls_tar -C CALLS_$index/SAMPLE_~{sample_index}
cp ${calling_configs_array[$index]} CALLS_$index/
cp ${denoising_configs_array[$index]} CALLS_$index/
cp ${gcnvkernel_version_array[$index]} CALLS_$index/
cp ${sharded_interval_lists_array[$index]} CALLS_$index/
calls_args="$calls_args --calls-shard-path CALLS_$index"
done

# untar models to MODEL_0, MODEL_1, etc directories and build the command line
gcnv_model_tar_array=(~{sep=" " gcnv_model_tars})
model_args=""
for index in ${!gcnv_model_tar_array[@]}; do
gcnv_model_tar=${gcnv_model_tar_array[$index]}
mkdir MODEL_$index
tar xzf $gcnv_model_tar -C MODEL_$index
model_args="$model_args --model-shard-path MODEL_$index"
tar xzf ~{invariants_tar}
rm ~{invariants_tar}
number_of_shards=`find . -name 'CALLS_*' | wc -l`

touch calls_and_model_args.txt
for i in $(seq 0 `expr $number_of_shards - 1`); do
echo "--calls-shard-path CALLS_$i" >> calls_and_model_args.txt
echo "--model-shard-path MODEL_$i" >> calls_and_model_args.txt
done

mkdir contig-ploidy-calls
tar xzf ~{contig_ploidy_calls_tar} -C contig-ploidy-calls
mkdir -p extracted-contig-ploidy-calls
tar xzf ~{contig_ploidy_calls_tar} -C extracted-contig-ploidy-calls
rm ~{contig_ploidy_calls_tar}

gatk --java-options "-Xmx~{command_mem_mb}m" PostprocessGermlineCNVCalls \
$calls_args \
$model_args \
time gatk --java-options "-Xmx~{command_mem_mb}m" PostprocessGermlineCNVCalls \
--arguments_file calls_and_model_args.txt \
~{sep=" " allosomal_contigs_args} \
--autosomal-ref-copy-number ~{ref_copy_number_autosomal_contigs} \
--contig-ploidy-calls contig-ploidy-calls \
--contig-ploidy-calls extracted-contig-ploidy-calls \
--sample-index ~{sample_index} \
--output-genotyped-intervals ~{genotyped_intervals_vcf_filename} \
--output-genotyped-segments ~{genotyped_segments_vcf_filename} \
--output-denoised-copy-ratios ~{denoised_copy_ratios_filename}

rm -rf CALLS_*
rm -rf MODEL_*
rm -rf contig-ploidy-calls
rm -rf extracted-contig-ploidy-calls
>>>

runtime {
docker: gatk_docker
memory: machine_mem_mb + " MB"
disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD"
disks: "local-disk " + select_first([disk_space_gb, vm_disk_size]) + if use_ssd then " SSD" else " HDD"
cpu: select_first([cpu, 1])
preemptible: select_first([preemptible_attempts, 5])
maxRetries: 1
}

output {
Expand Down Expand Up @@ -605,3 +616,122 @@ task CollectModelQualityMetrics {
String qc_status_string = read_string("qcStatus.txt")
}
}

task BundlePostprocessingInvariants {
input {
Array[File] calls_tars
Array[File] model_tars
Array[File] calling_configs
Array[File] denoising_configs
Array[File] gcnvkernel_version
Array[File] sharded_interval_lists

# Runtime parameters
String docker
Int? mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? cpu
Int? preemptible_attempts
}

command <<<
set -euo pipefail
mkdir -p out

calls_files_tar_list=~{write_lines(calls_tars)}
model_files_tar_list=~{write_lines(model_tars)}

calling_configs_list=~{write_lines(calling_configs)}
denoising_configs_list=~{write_lines(denoising_configs)}
gcnvkernel_version_list=~{write_lines(gcnvkernel_version)}
sharded_interval_lists_list=~{write_lines(sharded_interval_lists)}

cat $calls_files_tar_list | sort -V > calls_files_tar_list.sorted
cat $model_files_tar_list | sort -V > model_files_tar_list.sorted

cat $calling_configs_list | sort -V > calling_configs_list.sorted
cat $denoising_configs_list | sort -V > denoising_configs_list.sorted
cat $gcnvkernel_version_list | sort -V > gcnvkernel_version_list.sorted
cat $sharded_interval_lists_list | sort -V > sharded_interval_lists_list.sorted

paste calls_files_tar_list.sorted model_files_tar_list.sorted calling_configs_list.sorted denoising_configs_list.sorted gcnvkernel_version_list.sorted sharded_interval_lists_list.sorted |\
awk '{print (NR-1)"\t"$0}' > file_sets.sorted
OIFS=$IFS
IFS=$'\t'
while read index calls_tar model_tar call_config denoise version intervals; do
mkdir -p out/CALLS_$index
mkdir -p out/MODEL_$index
tar xzf $calls_tar -C out/CALLS_$index
tar xzf $model_tar -C out/MODEL_$index
cp $call_config out/CALLS_$index
cp $denoise out/CALLS_$index
cp $version out/CALLS_$index
cp $intervals out/CALLS_$index
rm $calls_tar $model_tar $call_config $denoise $version $intervals

done < file_sets.sorted
IFS=$OIFS

tar c -C out . | gzip -1 > case-gcnv-postprocessing-invariants.tar.gz
rm -Rf out
>>>

runtime {
docker: docker
memory: select_first([mem_gb, 2]) + " GiB"
disks: "local-disk " + select_first([disk_space_gb, 150]) + if use_ssd then " SSD" else " HDD"
cpu: select_first([cpu, 1])
preemptible: select_first([preemptible_attempts, 5])
}

output {
File bundle_tar = "case-gcnv-postprocessing-invariants.tar.gz"
}
}

task ScatterPloidyCallsBySample {
input {
File contig_ploidy_calls_tar
Array[String] samples

# Runtime parameters
String docker
Int? mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? cpu
Int? preemptible_attempts
}

Int num_samples = length(samples)
String out_dir = "calls_renamed"

command <<<
set -eu

# Extract ploidy calls
mkdir calls
tar xzf ~{contig_ploidy_calls_tar} -C calls/

# Archive call files by sample, renaming so they will be glob'd in order
sample_ids=(~{sep=" " samples})
for (( i=0; i<~{num_samples}; i++ ))
do
sample_id=${sample_ids[$i]}
sample_no=`printf %04d $i`
tar -czf sample_${sample_no}.${sample_id}.contig_ploidy_calls.tar.gz -C calls/SAMPLE_${i} .
done
>>>
runtime {
docker: docker
memory: select_first([mem_gb, 2]) + " GiB"
disks: "local-disk " + select_first([disk_space_gb, 10]) + if use_ssd then " SSD" else " HDD"
cpu: select_first([cpu, 1])
preemptible: select_first([preemptible_attempts, 5])
}

output {
Array[File] sample_contig_ploidy_calls_tar = glob("sample_*.contig_ploidy_calls.tar.gz")
}
}
22 changes: 12 additions & 10 deletions scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
##############################################
#### optional arguments for CollectCounts ####
##############################################
Array[String]? disabled_read_filters_for_collect_counts
String? collect_counts_format
Boolean? collect_counts_enable_indexing
Int? mem_gb_for_collect_counts
Expand Down Expand Up @@ -149,6 +150,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
preemptible_attempts = preemptible_attempts,
padding = padding,
bin_length = bin_length,
disabled_read_filters_for_collect_counts = disabled_read_filters_for_collect_counts,
collect_counts_format = collect_counts_format,
collect_counts_enable_indexing = collect_counts_enable_indexing,
mem_gb_for_collect_counts = mem_gb_for_collect_counts,
Expand Down Expand Up @@ -196,16 +198,16 @@ workflow CNVGermlineCaseScatteredWorkflow {

output {
Array[File] preprocessed_intervals = CNVGermlineCaseWorkflow.preprocessed_intervals
Array[Array[File]] read_counts_entity_id = CNVGermlineCaseWorkflow.read_counts_entity_id
Array[Array[File]] read_counts = CNVGermlineCaseWorkflow.read_counts
Array[File] contig_ploidy_calls_tars = CNVGermlineCaseWorkflow.contig_ploidy_calls_tar
Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars
Array[Array[File]] genotyped_intervals_vcf = CNVGermlineCaseWorkflow.genotyped_intervals_vcf
Array[Array[File]] genotyped_segments_vcf = CNVGermlineCaseWorkflow.genotyped_segments_vcf
Array[Array[File]] qc_status_files = CNVGermlineCaseWorkflow.qc_status_files
Array[Array[String]] qc_status_strings = CNVGermlineCaseWorkflow.qc_status_strings
Array[Array[File]] denoised_copy_ratios = CNVGermlineCaseWorkflow.denoised_copy_ratios
Array[File] read_counts_entity_id = flatten(CNVGermlineCaseWorkflow.read_counts_entity_id)
Array[File] read_counts = flatten(CNVGermlineCaseWorkflow.read_counts)
Array[File] sample_contig_ploidy_calls_tars = flatten(CNVGermlineCaseWorkflow.sample_contig_ploidy_calls_tars)
Array[File] gcnv_calls_tars = flatten(CNVGermlineCaseWorkflow.gcnv_calls_tars)
Array[File] gcnv_tracking_tars = flatten(CNVGermlineCaseWorkflow.gcnv_tracking_tars)
Array[File] genotyped_intervals_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_intervals_vcf)
Array[File] genotyped_segments_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_segments_vcf)
Array[File] denoised_copy_ratios = flatten(CNVGermlineCaseWorkflow.denoised_copy_ratios)
Array[File] qc_status_files = flatten(CNVGermlineCaseWorkflow.qc_status_files)
Array[String] qc_status_strings = flatten(CNVGermlineCaseWorkflow.qc_status_strings)
}
}

Expand Down
Loading

0 comments on commit f146f66

Please sign in to comment.