Skip to content

Commit

Permalink
Transpose instead of bundling
Browse files (browse the repository at this point in the history)
  • Loading branch information
mwalker174 committed Jun 1, 2020
1 parent c39e17d commit 3931c8d
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 88 deletions.
123 changes: 55 additions & 68 deletions scripts/cnv_wdl/cnv_common_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,12 @@ task ScatterIntervals {

task PostprocessGermlineCNVCalls {
input {
File bundled_gcnv_outputs
File gcnv_calls_sample_tar
Array[File] gcnv_model_tars
Array[File] calling_configs
Array[File] denoising_configs
Array[File] gcnvkernel_version
Array[File] sharded_interval_lists
String entity_id
File contig_ploidy_calls_tar
Array[String]? allosomal_contigs
Expand All @@ -463,11 +468,6 @@ task PostprocessGermlineCNVCalls {
Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
Int command_mem_mb = machine_mem_mb - 1000

Float bundled_gcnv_outputs_size = size(bundled_gcnv_outputs, "GiB")
Float disk_overhead = 20.0
Float tar_disk_factor= 5.0
Int vm_disk_size = ceil(tar_disk_factor * bundled_gcnv_outputs_size + disk_overhead)

String genotyped_intervals_vcf_filename = "genotyped-intervals-~{entity_id}.vcf.gz"
String genotyped_segments_vcf_filename = "genotyped-segments-~{entity_id}.vcf.gz"
String denoised_copy_ratios_filename = "denoised_copy_ratios-~{entity_id}.tsv"
Expand All @@ -476,44 +476,59 @@ task PostprocessGermlineCNVCalls {

command <<<
set -euo pipefail

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}

sharded_interval_lists_array=(~{sep=" " sharded_interval_lists})

# untar calls to CALLS_0, CALLS_1, etc directories and build the command line
# also copy over shard config and interval files
tar xzf ~{bundled_gcnv_outputs}
rm ~{bundled_gcnv_outputs}
number_of_shards=`find . -name 'CALLS_*' | wc -l`

touch calls_and_model_args.txt
for i in $(seq 0 `expr $number_of_shards - 1`); do
echo "--calls-shard-path CALLS_$i" >> calls_and_model_args.txt
echo "--model-shard-path MODEL_$i" >> calls_and_model_args.txt
calling_configs_array=(~{sep=" " calling_configs})
denoising_configs_array=(~{sep=" " denoising_configs})
gcnvkernel_version_array=(~{sep=" " gcnvkernel_version})
sharded_interval_lists_array=(~{sep=" " sharded_interval_lists})
calls_args=""
tar xzf ~{gcnv_calls_sample_tar}
for index in ${!calling_configs_array[@]}; do
cp ${calling_configs_array[$index]} CALLS_$index/
cp ${denoising_configs_array[$index]} CALLS_$index/
cp ${gcnvkernel_version_array[$index]} CALLS_$index/
cp ${sharded_interval_lists_array[$index]} CALLS_$index/
calls_args="$calls_args --calls-shard-path CALLS_$index"
done

mkdir -p extracted-contig-ploidy-calls
tar xzf ~{contig_ploidy_calls_tar} -C extracted-contig-ploidy-calls
rm ~{contig_ploidy_calls_tar}
# untar models to MODEL_0, MODEL_1, etc directories and build the command line
gcnv_model_tar_array=(~{sep=" " gcnv_model_tars})
model_args=""
for index in ${!gcnv_model_tar_array[@]}; do
gcnv_model_tar=${gcnv_model_tar_array[$index]}
mkdir MODEL_$index
tar xzf $gcnv_model_tar -C MODEL_$index
model_args="$model_args --model-shard-path MODEL_$index"
done

mkdir contig-ploidy-calls
tar xzf ~{contig_ploidy_calls_tar} -C contig-ploidy-calls

gatk --java-options "-Xmx~{command_mem_mb}m" PostprocessGermlineCNVCalls \
--arguments_file calls_and_model_args.txt \
$calls_args \
$model_args \
~{sep=" " allosomal_contigs_args} \
--autosomal-ref-copy-number ~{ref_copy_number_autosomal_contigs} \
--contig-ploidy-calls extracted-contig-ploidy-calls \
--contig-ploidy-calls contig-ploidy-calls \
--sample-index ~{sample_index} \
--output-genotyped-intervals ~{genotyped_intervals_vcf_filename} \
--output-genotyped-segments ~{genotyped_segments_vcf_filename} \
--output-denoised-copy-ratios ~{denoised_copy_ratios_filename}

rm -rf CALLS_*
rm -rf MODEL_*
rm -rf extracted-contig-ploidy-calls
rm -rf contig-ploidy-calls
>>>

runtime {
docker: gatk_docker
memory: machine_mem_mb + " MB"
disks: "local-disk " + select_first([disk_space_gb, vm_disk_size]) + if use_ssd then " SSD" else " HDD"
disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD"
cpu: select_first([cpu, 1])
preemptible: select_first([preemptible_attempts, 5])
maxRetries: 1
Expand Down Expand Up @@ -617,14 +632,9 @@ task CollectModelQualityMetrics {
}
}

task BundleCallerOutputs {
task TransposeCallerOutputs {
input {
Array[File] calls_tars
Array[File] model_tars
Array[File] calling_configs
Array[File] denoising_configs
Array[File] gcnvkernel_version
Array[File] sharded_interval_lists
Array[File] gcnv_calls_tars

# Runtime parameters
String docker
Expand All @@ -637,44 +647,21 @@ task BundleCallerOutputs {

command <<<
set -euo pipefail
mkdir -p out

calls_files_tar_list=~{write_lines(calls_tars)}
model_files_tar_list=~{write_lines(model_tars)}

calling_configs_list=~{write_lines(calling_configs)}
denoising_configs_list=~{write_lines(denoising_configs)}
gcnvkernel_version_list=~{write_lines(gcnvkernel_version)}
sharded_interval_lists_list=~{write_lines(sharded_interval_lists)}

cat $calls_files_tar_list | sort -V > calls_files_tar_list.sorted
cat $model_files_tar_list | sort -V > model_files_tar_list.sorted

cat $calling_configs_list | sort -V > calling_configs_list.sorted
cat $denoising_configs_list | sort -V > denoising_configs_list.sorted
cat $gcnvkernel_version_list | sort -V > gcnvkernel_version_list.sorted
cat $sharded_interval_lists_list | sort -V > sharded_interval_lists_list.sorted

paste calls_files_tar_list.sorted model_files_tar_list.sorted calling_configs_list.sorted denoising_configs_list.sorted gcnvkernel_version_list.sorted sharded_interval_lists_list.sorted |\
awk '{print (NR-1)"\t"$0}' > file_sets.sorted
OIFS=$IFS
IFS=$'\t'
while read index calls_tar model_tar call_config denoise version intervals; do
mkdir -p out/CALLS_$index
mkdir -p out/MODEL_$index
tar xzf $calls_tar -C out/CALLS_$index
tar xzf $model_tar -C out/MODEL_$index
cp $call_config out/CALLS_$index
cp $denoise out/CALLS_$index
cp $version out/CALLS_$index
cp $intervals out/CALLS_$index
rm $calls_tar $model_tar $call_config $denoise $version $intervals

done < file_sets.sorted
IFS=$OIFS

tar c -C out . | gzip -1 > case-gcnv-postprocessing-invariants.tar.gz
rm -Rf out

gcnv_calls_tar_array=(~{sep=" " gcnv_calls_tars})
for index in ${!gcnv_calls_tar_array[@]}; do
mkdir CALLS_$index
tar xzf ${gcnv_calls_tar_array[$index]} -C CALLS_$index
done

CURRENT_SAMPLE=0
NUM_SAMPLES=$(ls -d CALLS_0/SAMPLE_* | wc -l)
NUM_DIGITS=${#NUM_SAMPLES}
while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE)
tar c CALLS_*/SAMPLE_$CURRENT_SAMPLE | gzip -1 > case-gcnv-calls-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz
let CURRENT_SAMPLE=CURRENT_SAMPLE+1
done
>>>

runtime {
Expand All @@ -686,7 +673,7 @@ task BundleCallerOutputs {
}

output {
File bundle_tar = "case-gcnv-postprocessing-invariants.tar.gz"
Array[File] gcnv_calls_sample_tars = glob("case-gcnv-calls-sample-*.tar.gz")
}
}

Expand Down
19 changes: 10 additions & 9 deletions scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -232,24 +232,25 @@ workflow CNVGermlineCaseWorkflow {
}
}

call CNVTasks.BundleCallerOutputs {
call CNVTasks.TransposeCallerOutputs {
input:
calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar,
model_tars = gcnv_model_tars,
calling_configs = GermlineCNVCallerCaseMode.calling_config_json,
denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json,
gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json,
sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list,
gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar,
docker = gatk_docker,
mem_gb = mem_gb_for_bundle_caller_outputs,
disk_space_gb = disk_space_gb_for_bundle_caller_outputs,
preemptible_attempts = preemptible_attempts
}

scatter (sample_index in range(length(normal_bams))) {

call CNVTasks.PostprocessGermlineCNVCalls {
input:
bundled_gcnv_outputs = BundleCallerOutputs.bundle_tar,
gcnv_calls_sample_tar = TransposeCallerOutputs.gcnv_calls_sample_tars[sample_index],
gcnv_model_tars = gcnv_model_tars,
calling_configs = GermlineCNVCallerCaseMode.calling_config_json,
denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json,
gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json,
sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list,
entity_id = CollectCounts.entity_id[sample_index],
allosomal_contigs = allosomal_contigs,
ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
Expand Down Expand Up @@ -341,7 +342,7 @@ task DetermineGermlineContigPloidyCaseMode {
--mapping-error-rate ~{default="0.01" mapping_error_rate} \
--sample-psi-scale ~{default="0.0001" sample_psi_scale}

tar czf case-contig-ploidy-calls.tar.gz -C ~{output_dir_}/case-calls .
tar c -C ~{output_dir_}/case-calls . | gzip -1 > case-contig-ploidy-calls.tar.gz

rm -rf contig-ploidy-model
>>>
Expand Down
22 changes: 11 additions & 11 deletions scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -325,14 +325,9 @@ workflow CNVGermlineCohortWorkflow {
}
}

call CNVTasks.BundleCallerOutputs {
call CNVTasks.TransposeCallerOutputs {
input:
calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar,
model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
calling_configs = GermlineCNVCallerCohortMode.calling_config_json,
denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json,
gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json,
sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list,
gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar,
docker = gatk_docker,
mem_gb = mem_gb_for_bundle_caller_outputs,
disk_space_gb = disk_space_gb_for_bundle_caller_outputs,
Expand All @@ -342,7 +337,12 @@ workflow CNVGermlineCohortWorkflow {
scatter (sample_index in range(length(normal_bams))) {
call CNVTasks.PostprocessGermlineCNVCalls {
input:
bundled_gcnv_outputs = BundleCallerOutputs.bundle_tar,
gcnv_calls_sample_tar = TransposeCallerOutputs.gcnv_calls_sample_tars[sample_index],
gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
calling_configs = GermlineCNVCallerCohortMode.calling_config_json,
denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json,
gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json,
sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list,
entity_id = CollectCounts.entity_id[sample_index],
allosomal_contigs = allosomal_contigs,
ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
Expand Down Expand Up @@ -597,9 +597,9 @@ task GermlineCNVCallerCohortMode {
--caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \
--disable-annealing ~{default="false" disable_annealing}

tar c -C ~{output_dir_}/~{cohort_entity_id}-tracking . | gzip -1 > ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz
tar c -C ~{output_dir_}/~{cohort_entity_id}-calls . | gzip -1 > ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz
tar c -C ~{output_dir_}/~{cohort_entity_id}-model . | gzip -1 > ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz
tar czf ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-tracking .
tar czf ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls .
tar czf ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model .

rm -rf contig-ploidy-calls
>>>
Expand Down

0 comments on commit 3931c8d

Please sign in to comment.