From 70847757def8e187bd3c96cb3d9f86c86de085b4 Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Fri, 10 Jul 2020 14:00:27 -0400 Subject: [PATCH] Revert back to 2d file array --- scripts/cnv_wdl/cnv_common_tasks.wdl | 110 ++++++------------ .../cnv_germline_case_scattered_workflow.wdl | 2 +- .../germline/cnv_germline_case_workflow.wdl | 41 ++++--- .../germline/cnv_germline_cohort_workflow.wdl | 45 +++---- 4 files changed, 87 insertions(+), 111 deletions(-) diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl index 0f6e440ebf9..d43417a62dd 100644 --- a/scripts/cnv_wdl/cnv_common_tasks.wdl +++ b/scripts/cnv_wdl/cnv_common_tasks.wdl @@ -437,23 +437,26 @@ task ScatterIntervals { task PostprocessGermlineCNVCalls { input { - File gcnv_calls_sample_tar - Array[File] gcnv_model_tars - File gcnv_shard_configs_tar - String entity_id - File contig_ploidy_calls_tar - Array[String]? allosomal_contigs - Int ref_copy_number_autosomal_contigs - Int sample_index - File? gatk4_jar_override - - # Runtime parameters - String gatk_docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? preemptible_attempts + String entity_id + Array[File] gcnv_calls_tars + Array[File] gcnv_model_tars + Array[File] calling_configs + Array[File] denoising_configs + Array[File] gcnvkernel_version + Array[File] sharded_interval_lists + File contig_ploidy_calls_tar + Array[String]? allosomal_contigs + Int ref_copy_number_autosomal_contigs + Int sample_index + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? preemptible_attempts } Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 @@ -466,15 +469,28 @@ task PostprocessGermlineCNVCalls { Array[String] allosomal_contigs_args = if defined(allosomal_contigs) then prefix("--allosomal-contig ", select_first([allosomal_contigs])) else [] command <<< - set -euo pipefail + set -eu export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + sharded_interval_lists_array=(~{sep=" " sharded_interval_lists}) + # untar calls to CALLS_0, CALLS_1, etc directories and build the command line - tar xzf ~{gcnv_calls_sample_tar} - tar xzf ~{gcnv_shard_configs_tar} + # also copy over shard config and interval files + gcnv_calls_tar_array=(~{sep=" " gcnv_calls_tars}) + calling_configs_array=(~{sep=" " calling_configs}) + denoising_configs_array=(~{sep=" " denoising_configs}) + gcnvkernel_version_array=(~{sep=" " gcnvkernel_version}) + sharded_interval_lists_array=(~{sep=" " sharded_interval_lists}) calls_args="" - for calls_dir in CALLS_*; do - calls_args="$calls_args --calls-shard-path $calls_dir" + for index in ${!gcnv_calls_tar_array[@]}; do + gcnv_calls_tar=${gcnv_calls_tar_array[$index]} + mkdir -p CALLS_$index/SAMPLE_~{sample_index} + tar xzf $gcnv_calls_tar -C CALLS_$index/SAMPLE_~{sample_index} + cp ${calling_configs_array[$index]} CALLS_$index/ + cp ${denoising_configs_array[$index]} CALLS_$index/ + cp ${gcnvkernel_version_array[$index]} CALLS_$index/ + cp ${sharded_interval_lists_array[$index]} CALLS_$index/ + calls_args="$calls_args --calls-shard-path CALLS_$index" done # untar models to MODEL_0, MODEL_1, etc directories and build the command line @@ -512,7 +528,6 @@ task PostprocessGermlineCNVCalls { disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD" cpu: select_first([cpu, 1]) preemptible: select_first([preemptible_attempts, 5]) - maxRetries: 1 } output { @@ -613,55 +628,6 @@ task CollectModelQualityMetrics { } } -task TransposeCallerOutputs { - input { - Array[File] gcnv_calls_tars - - # Runtime parameters - String docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? preemptible_attempts - } - - command <<< - set -euo pipefail - - gcnv_calls_tar_array=(~{sep=" " gcnv_calls_tars}) - for index in ${!gcnv_calls_tar_array[@]}; do - mkdir CALLS_$index - tar xzf ${gcnv_calls_tar_array[$index]} -C CALLS_$index - done - - CURRENT_SAMPLE=0 - NUM_SAMPLES=$(ls -d CALLS_0/SAMPLE_* | wc -l) - NUM_DIGITS=${#NUM_SAMPLES} - while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do - CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE) - tar c CALLS_*/SAMPLE_$CURRENT_SAMPLE | gzip -1 > gcnv-calls-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz - let CURRENT_SAMPLE=CURRENT_SAMPLE+1 - done - - rm -r CALLS_*/SAMPLE_* - tar c CALLS_* | gzip -1 > gcnv-shard-configs.tar.gz - >>> - - runtime { - docker: docker - memory: select_first([mem_gb, 2]) + " GiB" - disks: "local-disk " + select_first([disk_space_gb, 150]) + if use_ssd then " SSD" else " HDD" - cpu: select_first([cpu, 1]) - preemptible: select_first([preemptible_attempts, 5]) - } - - output { - Array[File] gcnv_calls_sample_tars = glob("gcnv-calls-sample-*.tar.gz") - File gcnv_shard_configs_tar = "gcnv-shard-configs.tar.gz" - } -} - task ScatterPloidyCallsBySample { input { File contig_ploidy_calls_tar diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl index 23cd4dcfa0f..4fd09289465 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl @@ -209,7 +209,7 @@ workflow CNVGermlineCaseScatteredWorkflow { Array[File] read_counts_entity_id = flatten(CNVGermlineCaseWorkflow.read_counts_entity_id) Array[File] read_counts = flatten(CNVGermlineCaseWorkflow.read_counts) Array[File] sample_contig_ploidy_calls_tars = flatten(CNVGermlineCaseWorkflow.sample_contig_ploidy_calls_tars) - Array[Array[File]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars + Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars Array[File] genotyped_intervals_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_intervals_vcf) Array[File] genotyped_segments_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_segments_vcf) diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl index f127044f2f2..0131115f9c6 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl @@ -232,32 +232,25 @@ workflow CNVGermlineCaseWorkflow { } } - call CNVTasks.TransposeCallerOutputs { - input: - gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar, - docker = gatk_docker, - mem_gb = mem_gb_for_transpose_caller_outputs, - disk_space_gb = disk_space_gb_for_transpose_caller_outputs, - preemptible_attempts = preemptible_attempts - } + Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCaseMode.gcnv_call_tars) scatter (sample_index in range(length(normal_bams))) { - call CNVTasks.PostprocessGermlineCNVCalls { input: - gcnv_calls_sample_tar = TransposeCallerOutputs.gcnv_calls_sample_tars[sample_index], - gcnv_model_tars = gcnv_model_tars, - gcnv_shard_configs_tar = TransposeCallerOutputs.gcnv_shard_configs_tar, entity_id = CollectCounts.entity_id[sample_index], + gcnv_calls_tars = call_tars_sample_by_shard[sample_index], + gcnv_model_tars = gcnv_model_tars, + calling_configs = GermlineCNVCallerCaseMode.calling_config_json, + denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json, + gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json, + sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list, allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar, sample_index = sample_index, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - mem_gb = mem_gb_for_postprocess_germline_cnv_calls, - disk_space_gb = disk_space_gb_for_postprocess_germline_cnv_calls + preemptible_attempts = preemptible_attempts } call CNVTasks.CollectSampleQualityMetrics { @@ -283,7 +276,7 @@ workflow CNVGermlineCaseWorkflow { Array[File] read_counts_entity_id = CollectCounts.entity_id Array[File] read_counts = CollectCounts.counts Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar - Array[File] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar + Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf @@ -465,9 +458,17 @@ task GermlineCNVCallerCaseMode { --caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \ --disable-annealing ~{default="false" disable_annealing} - tar czf case-gcnv-calls-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-calls . tar czf case-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-tracking . + CURRENT_SAMPLE=0 + NUM_SAMPLES=~{num_samples} + NUM_DIGITS=${#NUM_SAMPLES} + while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do + CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE) + tar czf case-gcnv-calls-shard-~{scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ~{output_dir_}/case-calls/SAMPLE_$CURRENT_SAMPLE . + let CURRENT_SAMPLE=CURRENT_SAMPLE+1 + done + rm -rf contig-ploidy-calls rm -rf gcnv-model >>> @@ -481,7 +482,11 @@ task GermlineCNVCallerCaseMode { } output { - File gcnv_calls_tar = "case-gcnv-calls-shard-~{scatter_index}.tar.gz" + Array[File] gcnv_call_tars = glob("case-gcnv-calls-shard-~{scatter_index}-sample-*.tar.gz") File gcnv_tracking_tar = "case-gcnv-tracking-shard-~{scatter_index}.tar.gz" + File calling_config_json = "~{output_dir_}/case-calls/calling_config.json" + File denoising_config_json = "~{output_dir_}/case-calls/denoising_config.json" + File gcnvkernel_version_json = "~{output_dir_}/case-calls/gcnvkernel_version.json" + File sharded_interval_list = "~{output_dir_}/case-calls/interval_list.tsv" } } diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl index 3b68a555fba..aa91bed87d0 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl @@ -325,31 +325,26 @@ workflow CNVGermlineCohortWorkflow { } } - call CNVTasks.TransposeCallerOutputs { - input: - gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar, - docker = gatk_docker, - mem_gb = mem_gb_for_transpose_caller_outputs, - disk_space_gb = disk_space_gb_for_transpose_caller_outputs, - preemptible_attempts = preemptible_attempts - } - scatter (sample_index in range(length(normal_bams))) { + Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars) + + scatter (sample_index in range(length(CollectCounts.entity_id))) { call CNVTasks.PostprocessGermlineCNVCalls { input: - gcnv_calls_sample_tar = TransposeCallerOutputs.gcnv_calls_sample_tars[sample_index], - gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar, - gcnv_shard_configs_tar = TransposeCallerOutputs.gcnv_shard_configs_tar, entity_id = CollectCounts.entity_id[sample_index], + gcnv_calls_tars = call_tars_sample_by_shard[sample_index], + gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar, + calling_configs = GermlineCNVCallerCohortMode.calling_config_json, + denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json, + gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json, + sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list, + contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar, allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, - contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar, sample_index = sample_index, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - mem_gb = mem_gb_for_postprocess_germline_cnv_calls, - disk_space_gb = disk_space_gb_for_postprocess_germline_cnv_calls + preemptible_attempts = preemptible_attempts } call CNVTasks.CollectSampleQualityMetrics { @@ -386,7 +381,7 @@ workflow CNVGermlineCohortWorkflow { File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar - Array[File] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar + Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf @@ -537,6 +532,8 @@ task GermlineCNVCallerCohortMode { String output_dir_ = select_first([output_dir, "out"]) Int num_samples = length(read_count_files) + String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819 + command <<< set -eu export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} @@ -594,9 +591,17 @@ task GermlineCNVCallerCohortMode { --caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \ --disable-annealing ~{default="false" disable_annealing} + tar czf ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model . tar czf ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-tracking . - tar czf ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls . - tar czf ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model . + + CURRENT_SAMPLE=0 + NUM_SAMPLES=~{num_samples} + NUM_DIGITS=${#NUM_SAMPLES} + while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do + CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE) + tar czf ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE . + let CURRENT_SAMPLE=CURRENT_SAMPLE+1 + done rm -rf contig-ploidy-calls >>> @@ -611,7 +616,7 @@ task GermlineCNVCallerCohortMode { output { File gcnv_model_tar = "~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz" - File gcnv_calls_tar = "~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz" + Array[File] gcnv_call_tars = glob("~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-*.tar.gz") File gcnv_tracking_tar = "~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz" File calling_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/calling_config.json" File denoising_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/denoising_config.json"