From eb99158cc87ca837ccf7ecca023893626dcc8a5e Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Mon, 13 Jan 2020 18:17:28 -0500 Subject: [PATCH 01/10] gCNV WDLs for WGS --- scripts/cnv_wdl/cnv_common_tasks.wdl | 246 +++++++++++++----- .../cnv_germline_case_scattered_workflow.wdl | 22 +- .../germline/cnv_germline_case_workflow.wdl | 69 +++-- .../germline/cnv_germline_cohort_workflow.wdl | 77 +++--- 4 files changed, 292 insertions(+), 122 deletions(-) diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl index a4fe6b08237..85835f01061 100644 --- a/scripts/cnv_wdl/cnv_common_tasks.wdl +++ b/scripts/cnv_wdl/cnv_common_tasks.wdl @@ -188,6 +188,7 @@ task CollectCounts { File ref_fasta File ref_fasta_fai File ref_fasta_dict + Array[String]? disabled_read_filters Boolean? enable_indexing String? format File? gatk4_jar_override @@ -201,10 +202,27 @@ task CollectCounts { Int? preemptible_attempts } + parameter_meta { + bam: { + localization_optional: true + } + bam_idx: { + localization_optional: true + } + } + Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 Int command_mem_mb = machine_mem_mb - 1000 Boolean enable_indexing_ = select_first([enable_indexing, false]) + Array[String] disabled_read_filters_arr = if(defined(disabled_read_filters)) + then + prefix( + "--disable-read-filter ", + select_first([disabled_read_filters]) + ) + else + [] # Sample name is derived from the bam filename String base_filename = basename(bam, ".bam") @@ -257,7 +275,8 @@ task CollectCounts { --reference ~{ref_fasta} \ --format ~{default="HDF5" hdf5_or_tsv_or_null_format} \ --interval-merging-rule OVERLAPPING_ONLY \ - --output ~{counts_filename_for_collect_read_counts} + --output ~{counts_filename_for_collect_read_counts} \ + ~{sep=' ' disabled_read_filters_arr} if [ ~{do_block_compression} = "true" ]; then bgzip ~{counts_filename_for_collect_read_counts} @@ -303,6 +322,15 @@ task CollectAllelicCounts { Int? preemptible_attempts } + parameter_meta { + bam: { + localization_optional: true + } + bam_idx: { + localization_optional: true + } + } + Int machine_mem_mb = select_first([mem_gb, 13]) * 1000 Int command_mem_mb = machine_mem_mb - 1000 @@ -413,33 +441,33 @@ task ScatterIntervals { } } -task PostprocessGermlineCNVCalls { +task BundledPostprocessGermlineCNVCalls { input { - String entity_id - Array[File] gcnv_calls_tars - Array[File] gcnv_model_tars - Array[File] calling_configs - Array[File] denoising_configs - Array[File] gcnvkernel_version - Array[File] sharded_interval_lists - File contig_ploidy_calls_tar - Array[String]? allosomal_contigs - Int ref_copy_number_autosomal_contigs - Int sample_index - File? gatk4_jar_override - - # Runtime parameters - String gatk_docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? preemptible_attempts + File invariants_tar + String entity_id + File contig_ploidy_calls_tar + Array[String]? allosomal_contigs + Int ref_copy_number_autosomal_contigs + Int sample_index + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? preemptible_attempts } Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 Int command_mem_mb = machine_mem_mb - 1000 + Float invariants_size = size(invariants_tar, "GiB") + Float disk_overhead = 20.0 + Float tar_disk_factor= 5.0 + Int vm_disk_size = ceil(tar_disk_factor * invariants_size + disk_overhead) + String genotyped_intervals_vcf_filename = "genotyped-intervals-~{entity_id}.vcf.gz" String genotyped_segments_vcf_filename = "genotyped-segments-~{entity_id}.vcf.gz" String denoised_copy_ratios_filename = "denoised_copy_ratios-~{entity_id}.tsv" @@ -447,49 +475,31 @@ task PostprocessGermlineCNVCalls { Array[String] allosomal_contigs_args = if defined(allosomal_contigs) then prefix("--allosomal-contig ", select_first([allosomal_contigs])) else [] command <<< - set -eu - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + set -euo pipefail - sharded_interval_lists_array=(~{sep=" " sharded_interval_lists}) + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} # untar calls to CALLS_0, CALLS_1, etc directories and build the command line # also copy over shard config and interval files - gcnv_calls_tar_array=(~{sep=" " gcnv_calls_tars}) - calling_configs_array=(~{sep=" " calling_configs}) - denoising_configs_array=(~{sep=" " denoising_configs}) - gcnvkernel_version_array=(~{sep=" " gcnvkernel_version}) - sharded_interval_lists_array=(~{sep=" " sharded_interval_lists}) - calls_args="" - for index in ${!gcnv_calls_tar_array[@]}; do - gcnv_calls_tar=${gcnv_calls_tar_array[$index]} - mkdir -p CALLS_$index/SAMPLE_~{sample_index} - tar xzf $gcnv_calls_tar -C CALLS_$index/SAMPLE_~{sample_index} - cp ${calling_configs_array[$index]} CALLS_$index/ - cp ${denoising_configs_array[$index]} CALLS_$index/ - cp ${gcnvkernel_version_array[$index]} CALLS_$index/ - cp ${sharded_interval_lists_array[$index]} CALLS_$index/ - calls_args="$calls_args --calls-shard-path CALLS_$index" - done - - # untar models to MODEL_0, MODEL_1, etc directories and build the command line - gcnv_model_tar_array=(~{sep=" " gcnv_model_tars}) - model_args="" - for index in ${!gcnv_model_tar_array[@]}; do - gcnv_model_tar=${gcnv_model_tar_array[$index]} - mkdir MODEL_$index - tar xzf $gcnv_model_tar -C MODEL_$index - model_args="$model_args --model-shard-path MODEL_$index" + tar xzf ~{invariants_tar} + rm ~{invariants_tar} + number_of_shards=`find . -name 'CALLS_*' | wc -l` + + touch calls_and_model_args.txt + for i in $(seq 0 `expr $number_of_shards - 1`); do + echo "--calls-shard-path CALLS_$i" >> calls_and_model_args.txt + echo "--model-shard-path MODEL_$i" >> calls_and_model_args.txt done - mkdir contig-ploidy-calls - tar xzf ~{contig_ploidy_calls_tar} -C contig-ploidy-calls + mkdir -p extracted-contig-ploidy-calls + tar xzf ~{contig_ploidy_calls_tar} -C extracted-contig-ploidy-calls + rm ~{contig_ploidy_calls_tar} - gatk --java-options "-Xmx~{command_mem_mb}m" PostprocessGermlineCNVCalls \ - $calls_args \ - $model_args \ + time gatk --java-options "-Xmx~{command_mem_mb}m" PostprocessGermlineCNVCalls \ + --arguments_file calls_and_model_args.txt \ ~{sep=" " allosomal_contigs_args} \ --autosomal-ref-copy-number ~{ref_copy_number_autosomal_contigs} \ - --contig-ploidy-calls contig-ploidy-calls \ + --contig-ploidy-calls extracted-contig-ploidy-calls \ --sample-index ~{sample_index} \ --output-genotyped-intervals ~{genotyped_intervals_vcf_filename} \ --output-genotyped-segments ~{genotyped_segments_vcf_filename} \ @@ -497,15 +507,16 @@ task PostprocessGermlineCNVCalls { rm -rf CALLS_* rm -rf MODEL_* - rm -rf contig-ploidy-calls + rm -rf extracted-contig-ploidy-calls >>> runtime { docker: gatk_docker memory: machine_mem_mb + " MB" - disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD" + disks: "local-disk " + select_first([disk_space_gb, vm_disk_size]) + if use_ssd then " SSD" else " HDD" cpu: select_first([cpu, 1]) preemptible: select_first([preemptible_attempts, 5]) + maxRetries: 1 } output { @@ -605,3 +616,122 @@ task CollectModelQualityMetrics { String qc_status_string = read_string("qcStatus.txt") } } + +task BundlePostprocessingInvariants { + input { + Array[File] calls_tars + Array[File] model_tars + Array[File] calling_configs + Array[File] denoising_configs + Array[File] gcnvkernel_version + Array[File] sharded_interval_lists + + # Runtime parameters + String docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? preemptible_attempts + } + + command <<< + set -euo pipefail + mkdir -p out + + calls_files_tar_list=~{write_lines(calls_tars)} + model_files_tar_list=~{write_lines(model_tars)} + + calling_configs_list=~{write_lines(calling_configs)} + denoising_configs_list=~{write_lines(denoising_configs)} + gcnvkernel_version_list=~{write_lines(gcnvkernel_version)} + sharded_interval_lists_list=~{write_lines(sharded_interval_lists)} + + cat $calls_files_tar_list | sort -V > calls_files_tar_list.sorted + cat $model_files_tar_list | sort -V > model_files_tar_list.sorted + + cat $calling_configs_list | sort -V > calling_configs_list.sorted + cat $denoising_configs_list | sort -V > denoising_configs_list.sorted + cat $gcnvkernel_version_list | sort -V > gcnvkernel_version_list.sorted + cat $sharded_interval_lists_list | sort -V > sharded_interval_lists_list.sorted + + paste calls_files_tar_list.sorted model_files_tar_list.sorted calling_configs_list.sorted denoising_configs_list.sorted gcnvkernel_version_list.sorted sharded_interval_lists_list.sorted |\ + awk '{print (NR-1)"\t"$0}' > file_sets.sorted + OIFS=$IFS + IFS=$'\t' + while read index calls_tar model_tar call_config denoise version intervals; do + mkdir -p out/CALLS_$index + mkdir -p out/MODEL_$index + tar xzf $calls_tar -C out/CALLS_$index + tar xzf $model_tar -C out/MODEL_$index + cp $call_config out/CALLS_$index + cp $denoise out/CALLS_$index + cp $version out/CALLS_$index + cp $intervals out/CALLS_$index + rm $calls_tar $model_tar $call_config $denoise $version $intervals + + done < file_sets.sorted + IFS=$OIFS + + tar c -C out . | gzip -1 > case-gcnv-postprocessing-invariants.tar.gz + rm -Rf out + >>> + + runtime { + docker: docker + memory: select_first([mem_gb, 2]) + " GiB" + disks: "local-disk " + select_first([disk_space_gb, 150]) + if use_ssd then " SSD" else " HDD" + cpu: select_first([cpu, 1]) + preemptible: select_first([preemptible_attempts, 5]) + } + + output { + File bundle_tar = "case-gcnv-postprocessing-invariants.tar.gz" + } +} + +task ScatterPloidyCallsBySample { + input { + File contig_ploidy_calls_tar + Array[String] samples + + # Runtime parameters + String docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? preemptible_attempts + } + + Int num_samples = length(samples) + String out_dir = "calls_renamed" + + command <<< + set -eu + + # Extract ploidy calls + mkdir calls + tar xzf ~{contig_ploidy_calls_tar} -C calls/ + + # Archive call files by sample, renaming so they will be glob'd in order + sample_ids=(~{sep=" " samples}) + for (( i=0; i<~{num_samples}; i++ )) + do + sample_id=${sample_ids[$i]} + sample_no=`printf %04d $i` + tar -czf sample_${sample_no}.${sample_id}.contig_ploidy_calls.tar.gz -C calls/SAMPLE_${i} . + done + >>> + runtime { + docker: docker + memory: select_first([mem_gb, 2]) + " GiB" + disks: "local-disk " + select_first([disk_space_gb, 10]) + if use_ssd then " SSD" else " HDD" + cpu: select_first([cpu, 1]) + preemptible: select_first([preemptible_attempts, 5]) + } + + output { + Array[File] sample_contig_ploidy_calls_tar = glob("sample_*.contig_ploidy_calls.tar.gz") + } +} \ No newline at end of file diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl index 24634c4c369..ec1c8c6636b 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl @@ -49,6 +49,7 @@ workflow CNVGermlineCaseScatteredWorkflow { ############################################## #### optional arguments for CollectCounts #### ############################################## + Array[String]? disabled_read_filters_for_collect_counts String? collect_counts_format Boolean? collect_counts_enable_indexing Int? mem_gb_for_collect_counts @@ -149,6 +150,7 @@ workflow CNVGermlineCaseScatteredWorkflow { preemptible_attempts = preemptible_attempts, padding = padding, bin_length = bin_length, + disabled_read_filters_for_collect_counts = disabled_read_filters_for_collect_counts, collect_counts_format = collect_counts_format, collect_counts_enable_indexing = collect_counts_enable_indexing, mem_gb_for_collect_counts = mem_gb_for_collect_counts, @@ -196,16 +198,16 @@ workflow CNVGermlineCaseScatteredWorkflow { output { Array[File] preprocessed_intervals = CNVGermlineCaseWorkflow.preprocessed_intervals - Array[Array[File]] read_counts_entity_id = CNVGermlineCaseWorkflow.read_counts_entity_id - Array[Array[File]] read_counts = CNVGermlineCaseWorkflow.read_counts - Array[File] contig_ploidy_calls_tars = CNVGermlineCaseWorkflow.contig_ploidy_calls_tar - Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars - Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars - Array[Array[File]] genotyped_intervals_vcf = CNVGermlineCaseWorkflow.genotyped_intervals_vcf - Array[Array[File]] genotyped_segments_vcf = CNVGermlineCaseWorkflow.genotyped_segments_vcf - Array[Array[File]] qc_status_files = CNVGermlineCaseWorkflow.qc_status_files - Array[Array[String]] qc_status_strings = CNVGermlineCaseWorkflow.qc_status_strings - Array[Array[File]] denoised_copy_ratios = CNVGermlineCaseWorkflow.denoised_copy_ratios + Array[File] read_counts_entity_id = flatten(CNVGermlineCaseWorkflow.read_counts_entity_id) + Array[File] read_counts = flatten(CNVGermlineCaseWorkflow.read_counts) + Array[File] sample_contig_ploidy_calls_tars = flatten(CNVGermlineCaseWorkflow.sample_contig_ploidy_calls_tars) + Array[File] gcnv_calls_tars = flatten(CNVGermlineCaseWorkflow.gcnv_calls_tars) + Array[File] gcnv_tracking_tars = flatten(CNVGermlineCaseWorkflow.gcnv_tracking_tars) + Array[File] genotyped_intervals_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_intervals_vcf) + Array[File] genotyped_segments_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_segments_vcf) + Array[File] denoised_copy_ratios = flatten(CNVGermlineCaseWorkflow.denoised_copy_ratios) + Array[File] qc_status_files = flatten(CNVGermlineCaseWorkflow.qc_status_files) + Array[String] qc_status_strings = flatten(CNVGermlineCaseWorkflow.qc_status_strings) } } diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl index 81654ba7cc3..d13cee99c10 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl @@ -59,6 +59,7 @@ workflow CNVGermlineCaseWorkflow { ############################################## #### optional arguments for CollectCounts #### ############################################## + Array[String]? disabled_read_filters_for_collect_counts String? collect_counts_format Boolean? collect_counts_enable_indexing Int? mem_gb_for_collect_counts @@ -111,11 +112,19 @@ workflow CNVGermlineCaseWorkflow { Float? gcnv_caller_external_admixing_rate Boolean? gcnv_disable_annealing + ###################################################### + #### arguments for BundlePostprocessingInvariants #### + ###################################################### + Int? mem_gb_for_bundle_postprocessing_invariants + Int? disk_space_gb_for_bundle_postprocessing_invariants + ################################################### #### arguments for PostprocessGermlineCNVCalls #### ################################################### Int ref_copy_number_autosomal_contigs Array[String]? allosomal_contigs + Int? disk_space_gb_for_postprocess_germline_cnv_calls + Int? mem_gb_for_postprocess_germline_cnv_calls ########################## #### arguments for QC #### @@ -150,6 +159,7 @@ workflow CNVGermlineCaseWorkflow { ref_fasta_dict = ref_fasta_dict, format = collect_counts_format, enable_indexing = collect_counts_enable_indexing, + disabled_read_filters = disabled_read_filters_for_collect_counts, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, mem_gb = mem_gb_for_collect_counts, @@ -222,30 +232,39 @@ workflow CNVGermlineCaseWorkflow { } } - Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCaseMode.gcnv_call_tars) + call CNVTasks.BundlePostprocessingInvariants { + input: + calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar, + model_tars = gcnv_model_tars, + calling_configs = GermlineCNVCallerCaseMode.calling_config_json, + denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json, + gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json, + sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list, + docker = gatk_docker, + mem_gb = mem_gb_for_bundle_postprocessing_invariants, + disk_space_gb = disk_space_gb_for_bundle_postprocessing_invariants, + preemptible_attempts = preemptible_attempts + } scatter (sample_index in range(length(normal_bams))) { - call CNVTasks.PostprocessGermlineCNVCalls { + call CNVTasks.BundledPostprocessGermlineCNVCalls { input: + invariants_tar = BundlePostprocessingInvariants.bundle_tar, entity_id = CollectCounts.entity_id[sample_index], - gcnv_calls_tars = call_tars_sample_by_shard[sample_index], - gcnv_model_tars = gcnv_model_tars, - calling_configs = GermlineCNVCallerCaseMode.calling_config_json, - denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json, - gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json, - sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list, allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar, sample_index = sample_index, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts + preemptible_attempts = preemptible_attempts, + mem_gb = mem_gb_for_postprocess_germline_cnv_calls, + disk_space_gb = disk_space_gb_for_postprocess_germline_cnv_calls } call CNVTasks.CollectSampleQualityMetrics { input: - genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf, + genotyped_segments_vcf = BundledPostprocessGermlineCNVCalls.genotyped_segments_vcf, entity_id = CollectCounts.entity_id[sample_index], maximum_number_events = maximum_number_events_per_sample, gatk_docker = gatk_docker, @@ -253,18 +272,26 @@ workflow CNVGermlineCaseWorkflow { } } + call CNVTasks.ScatterPloidyCallsBySample { + input : + contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar, + samples = CollectCounts.entity_id, + docker = gatk_docker, + preemptible_attempts = preemptible_attempts + } + output { File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals Array[File] read_counts_entity_id = CollectCounts.entity_id Array[File] read_counts = CollectCounts.counts - File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar - Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars + Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar + Array[File] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar - Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf - Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf + Array[File] genotyped_intervals_vcf = BundledPostprocessGermlineCNVCalls.genotyped_intervals_vcf + Array[File] genotyped_segments_vcf = BundledPostprocessGermlineCNVCalls.genotyped_segments_vcf + Array[File] denoised_copy_ratios = BundledPostprocessGermlineCNVCalls.denoised_copy_ratios Array[File] qc_status_files = CollectSampleQualityMetrics.qc_status_file Array[String] qc_status_strings = CollectSampleQualityMetrics.qc_status_string - Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios } } @@ -441,15 +468,7 @@ task GermlineCNVCallerCaseMode { --disable-annealing ~{default="false" disable_annealing} tar czf case-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-tracking . - - CURRENT_SAMPLE=0 - NUM_SAMPLES=~{num_samples} - NUM_DIGITS=${#NUM_SAMPLES} - while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do - CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE) - tar czf case-gcnv-calls-shard-~{scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ~{output_dir_}/case-calls/SAMPLE_$CURRENT_SAMPLE . - let CURRENT_SAMPLE=CURRENT_SAMPLE+1 - done + tar czf case-gcnv-calls-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-calls . rm -rf contig-ploidy-calls rm -rf gcnv-model @@ -464,7 +483,7 @@ task GermlineCNVCallerCaseMode { } output { - Array[File] gcnv_call_tars = glob("case-gcnv-calls-shard-~{scatter_index}-sample-*.tar.gz") + File gcnv_calls_tar = "case-gcnv-calls-shard-~{scatter_index}.tar.gz" File gcnv_tracking_tar = "case-gcnv-tracking-shard-~{scatter_index}.tar.gz" File calling_config_json = "~{output_dir_}/case-calls/calling_config.json" File denoising_config_json = "~{output_dir_}/case-calls/denoising_config.json" diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl index 66e54f2f39c..dc0a9af4123 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl @@ -87,6 +87,7 @@ workflow CNVGermlineCohortWorkflow { ############################################## #### optional arguments for CollectCounts #### ############################################## + Array[String]? disabled_read_filters_for_collect_counts String? collect_counts_format Boolean? collect_counts_enable_indexing Int? mem_gb_for_collect_counts @@ -148,10 +149,18 @@ workflow CNVGermlineCohortWorkflow { Float? gcnv_caller_external_admixing_rate Boolean? gcnv_disable_annealing + ###################################################### + #### arguments for BundlePostprocessingInvariants #### + ###################################################### + Int? mem_gb_for_bundle_postprocessing_invariants + Int? disk_space_gb_for_bundle_postprocessing_invariants + ################################################### #### arguments for PostprocessGermlineCNVCalls #### ################################################### Int ref_copy_number_autosomal_contigs + Int? mem_gb_for_postprocess_germline_cnv_calls + Int? disk_space_gb_for_postprocess_germline_cnv_calls Array[String]? allosomal_contigs ########################## @@ -206,6 +215,7 @@ workflow CNVGermlineCohortWorkflow { ref_fasta_dict = ref_fasta_dict, format = collect_counts_format, enable_indexing = collect_counts_enable_indexing, + disabled_read_filters = disabled_read_filters_for_collect_counts, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, mem_gb = mem_gb_for_collect_counts, @@ -315,30 +325,39 @@ workflow CNVGermlineCohortWorkflow { } } - Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars) + call CNVTasks.BundlePostprocessingInvariants { + input: + calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar, + model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar, + calling_configs = GermlineCNVCallerCohortMode.calling_config_json, + denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json, + gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json, + sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list, + docker = gatk_docker, + mem_gb = mem_gb_for_bundle_postprocessing_invariants, + disk_space_gb = disk_space_gb_for_bundle_postprocessing_invariants, + preemptible_attempts = preemptible_attempts + } - scatter (sample_index in range(length(CollectCounts.entity_id))) { - call CNVTasks.PostprocessGermlineCNVCalls { + scatter (sample_index in range(length(normal_bams))) { + call CNVTasks.BundledPostprocessGermlineCNVCalls { input: + invariants_tar = BundlePostprocessingInvariants.bundle_tar, entity_id = CollectCounts.entity_id[sample_index], - gcnv_calls_tars = call_tars_sample_by_shard[sample_index], - gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar, - calling_configs = GermlineCNVCallerCohortMode.calling_config_json, - denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json, - gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json, - sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list, - contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar, allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, + contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar, sample_index = sample_index, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts + preemptible_attempts = preemptible_attempts, + mem_gb = mem_gb_for_postprocess_germline_cnv_calls, + disk_space_gb = disk_space_gb_for_postprocess_germline_cnv_calls } call CNVTasks.CollectSampleQualityMetrics { input: - genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf, + genotyped_segments_vcf = BundledPostprocessGermlineCNVCalls.genotyped_segments_vcf, entity_id = CollectCounts.entity_id[sample_index], maximum_number_events = maximum_number_events_per_sample, gatk_docker = gatk_docker, @@ -353,6 +372,14 @@ workflow CNVGermlineCohortWorkflow { preemptible_attempts = preemptible_attempts } + call CNVTasks.ScatterPloidyCallsBySample { + input : + contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar, + samples = CollectCounts.entity_id, + docker = gatk_docker, + preemptible_attempts = preemptible_attempts + } + output { File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals Array[File] read_counts_entity_ids = CollectCounts.entity_id @@ -360,17 +387,17 @@ workflow CNVGermlineCohortWorkflow { File? annotated_intervals = AnnotateIntervals.annotated_intervals File filtered_intervals = FilterIntervals.filtered_intervals File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar - File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar + Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar - Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars + Array[File] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar - Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf - Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf + Array[File] genotyped_intervals_vcfs = BundledPostprocessGermlineCNVCalls.genotyped_intervals_vcf + Array[File] genotyped_segments_vcfs = BundledPostprocessGermlineCNVCalls.genotyped_segments_vcf + Array[File] denoised_copy_ratios = BundledPostprocessGermlineCNVCalls.denoised_copy_ratios Array[File] sample_qc_status_files = CollectSampleQualityMetrics.qc_status_file Array[String] sample_qc_status_strings = CollectSampleQualityMetrics.qc_status_string File model_qc_status_file = CollectModelQualityMetrics.qc_status_file String model_qc_string = CollectModelQualityMetrics.qc_status_string - Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios } } @@ -572,17 +599,9 @@ task GermlineCNVCallerCohortMode { --caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \ --disable-annealing ~{default="false" disable_annealing} - tar czf ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model . - tar czf ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-tracking . - - CURRENT_SAMPLE=0 - NUM_SAMPLES=~{num_samples} - NUM_DIGITS=${#NUM_SAMPLES} - while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do - CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE) - tar czf ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE . - let CURRENT_SAMPLE=CURRENT_SAMPLE+1 - done + tar c -C ~{output_dir_}/~{cohort_entity_id}-tracking . | gzip -1 > ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz + tar c -C ~{output_dir_}/~{cohort_entity_id}-calls . | gzip -1 > ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz + tar c -C ~{output_dir_}/~{cohort_entity_id}-model . | gzip -1 > ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz rm -rf contig-ploidy-calls >>> @@ -597,7 +616,7 @@ task GermlineCNVCallerCohortMode { output { File gcnv_model_tar = "~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz" - Array[File] gcnv_call_tars = glob("~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-*.tar.gz") + File gcnv_calls_tar = "~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz" File gcnv_tracking_tar = "~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz" File calling_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/calling_config.json" File denoising_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/denoising_config.json" From a7689b4640228287277787c7e1b4570da1cf4490 Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Fri, 22 May 2020 14:49:05 -0400 Subject: [PATCH 02/10] Start addressing reviewer comments --- scripts/cnv_wdl/cnv_common_tasks.wdl | 16 ++++++------ .../germline/cnv_germline_case_workflow.wdl | 24 ++++++++--------- .../germline/cnv_germline_cohort_workflow.wdl | 26 +++++++++---------- 3 files changed, 32 insertions(+), 34 deletions(-) diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl index 85835f01061..3f8a8f54914 100644 --- a/scripts/cnv_wdl/cnv_common_tasks.wdl +++ b/scripts/cnv_wdl/cnv_common_tasks.wdl @@ -441,9 +441,9 @@ task ScatterIntervals { } } -task BundledPostprocessGermlineCNVCalls { +task PostprocessGermlineCNVCalls { input { - File invariants_tar + File bundled_gcnv_outputs String entity_id File contig_ploidy_calls_tar Array[String]? allosomal_contigs @@ -463,10 +463,10 @@ task BundledPostprocessGermlineCNVCalls { Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 Int command_mem_mb = machine_mem_mb - 1000 - Float invariants_size = size(invariants_tar, "GiB") + Float bundled_gcnv_outputs_size = size(bundled_gcnv_outputs, "GiB") Float disk_overhead = 20.0 Float tar_disk_factor= 5.0 - Int vm_disk_size = ceil(tar_disk_factor * invariants_size + disk_overhead) + Int vm_disk_size = ceil(tar_disk_factor * bundled_gcnv_outputs_size + disk_overhead) String genotyped_intervals_vcf_filename = "genotyped-intervals-~{entity_id}.vcf.gz" String genotyped_segments_vcf_filename = "genotyped-segments-~{entity_id}.vcf.gz" @@ -481,8 +481,8 @@ task BundledPostprocessGermlineCNVCalls { # untar calls to CALLS_0, CALLS_1, etc directories and build the command line # also copy over shard config and interval files - tar xzf ~{invariants_tar} - rm ~{invariants_tar} + tar xzf ~{bundled_gcnv_outputs} + rm ~{bundled_gcnv_outputs} number_of_shards=`find . -name 'CALLS_*' | wc -l` touch calls_and_model_args.txt @@ -495,7 +495,7 @@ task BundledPostprocessGermlineCNVCalls { tar xzf ~{contig_ploidy_calls_tar} -C extracted-contig-ploidy-calls rm ~{contig_ploidy_calls_tar} - time gatk --java-options "-Xmx~{command_mem_mb}m" PostprocessGermlineCNVCalls \ + gatk --java-options "-Xmx~{command_mem_mb}m" PostprocessGermlineCNVCalls \ --arguments_file calls_and_model_args.txt \ ~{sep=" " allosomal_contigs_args} \ --autosomal-ref-copy-number ~{ref_copy_number_autosomal_contigs} \ @@ -617,7 +617,7 @@ task CollectModelQualityMetrics { } } -task BundlePostprocessingInvariants { +task BundleCallerOutputs { input { Array[File] calls_tars Array[File] model_tars diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl index d13cee99c10..d643699c1f4 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl @@ -113,10 +113,10 @@ workflow CNVGermlineCaseWorkflow { Boolean? gcnv_disable_annealing ###################################################### - #### arguments for BundlePostprocessingInvariants #### + #### arguments for BundleCallerOutputs #### ###################################################### - Int? mem_gb_for_bundle_postprocessing_invariants - Int? disk_space_gb_for_bundle_postprocessing_invariants + Int? mem_gb_for_bundle_caller_outputs + Int? disk_space_gb_for_bundle_caller_outputs ################################################### #### arguments for PostprocessGermlineCNVCalls #### @@ -232,7 +232,7 @@ workflow CNVGermlineCaseWorkflow { } } - call CNVTasks.BundlePostprocessingInvariants { + call CNVTasks.BundleCallerOutputs { input: calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar, model_tars = gcnv_model_tars, @@ -241,15 +241,15 @@ workflow CNVGermlineCaseWorkflow { gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json, sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list, docker = gatk_docker, - mem_gb = mem_gb_for_bundle_postprocessing_invariants, - disk_space_gb = disk_space_gb_for_bundle_postprocessing_invariants, + mem_gb = mem_gb_for_bundle_caller_outputs, + disk_space_gb = disk_space_gb_for_bundle_caller_outputs, preemptible_attempts = preemptible_attempts } scatter (sample_index in range(length(normal_bams))) { - call CNVTasks.BundledPostprocessGermlineCNVCalls { + call CNVTasks.PostprocessGermlineCNVCalls { input: - invariants_tar = BundlePostprocessingInvariants.bundle_tar, + bundled_gcnv_outputs = BundleCallerOutputs.bundle_tar, entity_id = CollectCounts.entity_id[sample_index], allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, @@ -264,7 +264,7 @@ workflow CNVGermlineCaseWorkflow { call CNVTasks.CollectSampleQualityMetrics { input: - genotyped_segments_vcf = BundledPostprocessGermlineCNVCalls.genotyped_segments_vcf, + genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf, entity_id = CollectCounts.entity_id[sample_index], maximum_number_events = maximum_number_events_per_sample, gatk_docker = gatk_docker, @@ -287,9 +287,9 @@ workflow CNVGermlineCaseWorkflow { Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar Array[File] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar - Array[File] genotyped_intervals_vcf = BundledPostprocessGermlineCNVCalls.genotyped_intervals_vcf - Array[File] genotyped_segments_vcf = BundledPostprocessGermlineCNVCalls.genotyped_segments_vcf - Array[File] denoised_copy_ratios = BundledPostprocessGermlineCNVCalls.denoised_copy_ratios + Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf + Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf + Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios Array[File] qc_status_files = CollectSampleQualityMetrics.qc_status_file Array[String] qc_status_strings = CollectSampleQualityMetrics.qc_status_string } diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl index dc0a9af4123..adc699ae306 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl @@ -150,10 +150,10 @@ workflow CNVGermlineCohortWorkflow { Boolean? gcnv_disable_annealing ###################################################### - #### arguments for BundlePostprocessingInvariants #### + #### arguments for BundleCallerOutputs #### ###################################################### - Int? mem_gb_for_bundle_postprocessing_invariants - Int? disk_space_gb_for_bundle_postprocessing_invariants + Int? mem_gb_for_bundle_caller_outputs + Int? disk_space_gb_for_bundle_caller_outputs ################################################### #### arguments for PostprocessGermlineCNVCalls #### @@ -325,7 +325,7 @@ workflow CNVGermlineCohortWorkflow { } } - call CNVTasks.BundlePostprocessingInvariants { + call CNVTasks.BundleCallerOutputs { input: calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar, model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar, @@ -334,15 +334,15 @@ workflow CNVGermlineCohortWorkflow { gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json, sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list, docker = gatk_docker, - mem_gb = mem_gb_for_bundle_postprocessing_invariants, - disk_space_gb = disk_space_gb_for_bundle_postprocessing_invariants, + mem_gb = mem_gb_for_bundle_caller_outputs, + disk_space_gb = disk_space_gb_for_bundle_caller_outputs, preemptible_attempts = preemptible_attempts } scatter (sample_index in range(length(normal_bams))) { - call CNVTasks.BundledPostprocessGermlineCNVCalls { + call CNVTasks.PostprocessGermlineCNVCalls { input: - invariants_tar = BundlePostprocessingInvariants.bundle_tar, + bundled_gcnv_outputs = BundleCallerOutputs.bundle_tar, entity_id = CollectCounts.entity_id[sample_index], allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, @@ -357,7 +357,7 @@ workflow CNVGermlineCohortWorkflow { call CNVTasks.CollectSampleQualityMetrics { input: - genotyped_segments_vcf = BundledPostprocessGermlineCNVCalls.genotyped_segments_vcf, + genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf, entity_id = CollectCounts.entity_id[sample_index], maximum_number_events = maximum_number_events_per_sample, gatk_docker = gatk_docker, @@ -391,9 +391,9 @@ workflow CNVGermlineCohortWorkflow { Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar Array[File] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar - Array[File] genotyped_intervals_vcfs = BundledPostprocessGermlineCNVCalls.genotyped_intervals_vcf - Array[File] genotyped_segments_vcfs = BundledPostprocessGermlineCNVCalls.genotyped_segments_vcf - Array[File] denoised_copy_ratios = BundledPostprocessGermlineCNVCalls.denoised_copy_ratios + Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf + Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf + Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios Array[File] sample_qc_status_files = CollectSampleQualityMetrics.qc_status_file Array[String] sample_qc_status_strings = CollectSampleQualityMetrics.qc_status_string File model_qc_status_file = CollectModelQualityMetrics.qc_status_file @@ -540,8 +540,6 @@ task GermlineCNVCallerCohortMode { String output_dir_ = select_first([output_dir, "out"]) Int num_samples = length(read_count_files) - String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819 - command <<< set -eu export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} From af0d56f554452607eafcdfdf4efd0fbdc025d5b7 Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Thu, 28 May 2020 13:37:02 -0400 Subject: [PATCH 03/10] Transpose instead of bundling --- scripts/cnv_wdl/cnv_common_tasks.wdl | 123 ++++++++---------- .../germline/cnv_germline_case_workflow.wdl | 19 +-- .../germline/cnv_germline_cohort_workflow.wdl | 22 ++-- 3 files changed, 76 insertions(+), 88 deletions(-) diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl index 3f8a8f54914..5a13ed20c6a 100644 --- a/scripts/cnv_wdl/cnv_common_tasks.wdl +++ b/scripts/cnv_wdl/cnv_common_tasks.wdl @@ -443,7 +443,12 @@ task ScatterIntervals { task PostprocessGermlineCNVCalls { input { - File bundled_gcnv_outputs + File gcnv_calls_sample_tar + Array[File] gcnv_model_tars + Array[File] calling_configs + Array[File] denoising_configs + Array[File] gcnvkernel_version + Array[File] sharded_interval_lists String entity_id File contig_ploidy_calls_tar Array[String]? allosomal_contigs @@ -463,11 +468,6 @@ task PostprocessGermlineCNVCalls { Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 Int command_mem_mb = machine_mem_mb - 1000 - Float bundled_gcnv_outputs_size = size(bundled_gcnv_outputs, "GiB") - Float disk_overhead = 20.0 - Float tar_disk_factor= 5.0 - Int vm_disk_size = ceil(tar_disk_factor * bundled_gcnv_outputs_size + disk_overhead) - String genotyped_intervals_vcf_filename = "genotyped-intervals-~{entity_id}.vcf.gz" String genotyped_segments_vcf_filename = "genotyped-segments-~{entity_id}.vcf.gz" String denoised_copy_ratios_filename = "denoised_copy_ratios-~{entity_id}.tsv" @@ -476,30 +476,45 @@ task PostprocessGermlineCNVCalls { command <<< set -euo pipefail - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + sharded_interval_lists_array=(~{sep=" " sharded_interval_lists}) + # untar calls to CALLS_0, CALLS_1, etc directories and build the command line # also copy over shard config and interval files - tar xzf ~{bundled_gcnv_outputs} - rm ~{bundled_gcnv_outputs} - number_of_shards=`find . -name 'CALLS_*' | wc -l` - - touch calls_and_model_args.txt - for i in $(seq 0 `expr $number_of_shards - 1`); do - echo "--calls-shard-path CALLS_$i" >> calls_and_model_args.txt - echo "--model-shard-path MODEL_$i" >> calls_and_model_args.txt + calling_configs_array=(~{sep=" " calling_configs}) + denoising_configs_array=(~{sep=" " denoising_configs}) + gcnvkernel_version_array=(~{sep=" " gcnvkernel_version}) + sharded_interval_lists_array=(~{sep=" " sharded_interval_lists}) + calls_args="" + tar xzf ~{gcnv_calls_sample_tar} + for index in ${!calling_configs_array[@]}; do + cp ${calling_configs_array[$index]} CALLS_$index/ + cp ${denoising_configs_array[$index]} CALLS_$index/ + cp ${gcnvkernel_version_array[$index]} CALLS_$index/ + cp ${sharded_interval_lists_array[$index]} CALLS_$index/ + calls_args="$calls_args --calls-shard-path CALLS_$index" done - mkdir -p extracted-contig-ploidy-calls - tar xzf ~{contig_ploidy_calls_tar} -C extracted-contig-ploidy-calls - rm ~{contig_ploidy_calls_tar} + # untar models to MODEL_0, MODEL_1, etc directories and build the command line + gcnv_model_tar_array=(~{sep=" " gcnv_model_tars}) + model_args="" + for index in ${!gcnv_model_tar_array[@]}; do + gcnv_model_tar=${gcnv_model_tar_array[$index]} + mkdir MODEL_$index + tar xzf $gcnv_model_tar -C MODEL_$index + model_args="$model_args --model-shard-path MODEL_$index" + done + + mkdir contig-ploidy-calls + tar xzf ~{contig_ploidy_calls_tar} -C contig-ploidy-calls gatk --java-options "-Xmx~{command_mem_mb}m" PostprocessGermlineCNVCalls \ - --arguments_file calls_and_model_args.txt \ + $calls_args \ + $model_args \ ~{sep=" " allosomal_contigs_args} \ --autosomal-ref-copy-number ~{ref_copy_number_autosomal_contigs} \ - --contig-ploidy-calls extracted-contig-ploidy-calls \ + --contig-ploidy-calls contig-ploidy-calls \ --sample-index ~{sample_index} \ --output-genotyped-intervals ~{genotyped_intervals_vcf_filename} \ --output-genotyped-segments ~{genotyped_segments_vcf_filename} \ @@ -507,13 +522,13 @@ task PostprocessGermlineCNVCalls { rm -rf CALLS_* rm -rf MODEL_* - rm -rf extracted-contig-ploidy-calls + rm -rf contig-ploidy-calls >>> runtime { docker: gatk_docker memory: machine_mem_mb + " MB" - disks: "local-disk " + select_first([disk_space_gb, vm_disk_size]) + if use_ssd then " SSD" else " HDD" + disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD" cpu: select_first([cpu, 1]) preemptible: select_first([preemptible_attempts, 5]) maxRetries: 1 @@ -617,14 +632,9 @@ task CollectModelQualityMetrics { } } -task BundleCallerOutputs { +task TransposeCallerOutputs { input { - Array[File] calls_tars - Array[File] model_tars - Array[File] calling_configs - Array[File] denoising_configs - Array[File] gcnvkernel_version - Array[File] sharded_interval_lists + Array[File] gcnv_calls_tars # Runtime parameters String docker @@ -637,44 +647,21 @@ task BundleCallerOutputs { command <<< set -euo pipefail - mkdir -p out - - calls_files_tar_list=~{write_lines(calls_tars)} - model_files_tar_list=~{write_lines(model_tars)} - - calling_configs_list=~{write_lines(calling_configs)} - denoising_configs_list=~{write_lines(denoising_configs)} - gcnvkernel_version_list=~{write_lines(gcnvkernel_version)} - sharded_interval_lists_list=~{write_lines(sharded_interval_lists)} - - cat $calls_files_tar_list | sort -V > calls_files_tar_list.sorted - cat $model_files_tar_list | sort -V > model_files_tar_list.sorted - - cat $calling_configs_list | sort -V > calling_configs_list.sorted - cat $denoising_configs_list | sort -V > denoising_configs_list.sorted - cat $gcnvkernel_version_list | sort -V > gcnvkernel_version_list.sorted - cat $sharded_interval_lists_list | sort -V > sharded_interval_lists_list.sorted - - paste calls_files_tar_list.sorted model_files_tar_list.sorted calling_configs_list.sorted denoising_configs_list.sorted gcnvkernel_version_list.sorted sharded_interval_lists_list.sorted |\ - awk '{print (NR-1)"\t"$0}' > file_sets.sorted - OIFS=$IFS - IFS=$'\t' - while read index calls_tar model_tar call_config denoise version intervals; do - mkdir -p out/CALLS_$index - mkdir -p out/MODEL_$index - tar xzf $calls_tar -C out/CALLS_$index - tar xzf $model_tar -C out/MODEL_$index - cp $call_config out/CALLS_$index - cp $denoise out/CALLS_$index - cp $version out/CALLS_$index - cp $intervals out/CALLS_$index - rm $calls_tar $model_tar $call_config $denoise $version $intervals - - done < file_sets.sorted - IFS=$OIFS - - tar c -C out . | gzip -1 > case-gcnv-postprocessing-invariants.tar.gz - rm -Rf out + + gcnv_calls_tar_array=(~{sep=" " gcnv_calls_tars}) + for index in ${!gcnv_calls_tar_array[@]}; do + mkdir CALLS_$index + tar xzf ${gcnv_calls_tar_array[$index]} -C CALLS_$index + done + + CURRENT_SAMPLE=0 + NUM_SAMPLES=$(ls -d CALLS_0/SAMPLE_* | wc -l) + NUM_DIGITS=${#NUM_SAMPLES} + while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do + CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE) + tar c CALLS_*/SAMPLE_$CURRENT_SAMPLE | gzip -1 > case-gcnv-calls-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz + let CURRENT_SAMPLE=CURRENT_SAMPLE+1 + done >>> runtime { @@ -686,7 +673,7 @@ task BundleCallerOutputs { } output { - File bundle_tar = "case-gcnv-postprocessing-invariants.tar.gz" + Array[File] gcnv_calls_sample_tars = glob("case-gcnv-calls-sample-*.tar.gz") } } diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl index d643699c1f4..fc08c141999 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl @@ -232,14 +232,9 @@ workflow CNVGermlineCaseWorkflow { } } - call CNVTasks.BundleCallerOutputs { + call CNVTasks.TransposeCallerOutputs { input: - calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar, - model_tars = gcnv_model_tars, - calling_configs = GermlineCNVCallerCaseMode.calling_config_json, - denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json, - gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json, - sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list, + gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar, docker = gatk_docker, mem_gb = mem_gb_for_bundle_caller_outputs, disk_space_gb = disk_space_gb_for_bundle_caller_outputs, @@ -247,9 +242,15 @@ workflow CNVGermlineCaseWorkflow { } scatter (sample_index in range(length(normal_bams))) { + call CNVTasks.PostprocessGermlineCNVCalls { input: - bundled_gcnv_outputs = BundleCallerOutputs.bundle_tar, + gcnv_calls_sample_tar = TransposeCallerOutputs.gcnv_calls_sample_tars[sample_index], + gcnv_model_tars = gcnv_model_tars, + calling_configs = GermlineCNVCallerCaseMode.calling_config_json, + denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json, + gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json, + sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list, entity_id = CollectCounts.entity_id[sample_index], allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, @@ -341,7 +342,7 @@ task DetermineGermlineContigPloidyCaseMode { --mapping-error-rate ~{default="0.01" mapping_error_rate} \ --sample-psi-scale ~{default="0.0001" sample_psi_scale} - tar czf case-contig-ploidy-calls.tar.gz -C ~{output_dir_}/case-calls . + tar c -C ~{output_dir_}/case-calls . | gzip -1 > case-contig-ploidy-calls.tar.gz rm -rf contig-ploidy-model >>> diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl index adc699ae306..2ad50a50008 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl @@ -325,14 +325,9 @@ workflow CNVGermlineCohortWorkflow { } } - call CNVTasks.BundleCallerOutputs { + call CNVTasks.TransposeCallerOutputs { input: - calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar, - model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar, - calling_configs = GermlineCNVCallerCohortMode.calling_config_json, - denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json, - gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json, - sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list, + gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar, docker = gatk_docker, mem_gb = mem_gb_for_bundle_caller_outputs, disk_space_gb = disk_space_gb_for_bundle_caller_outputs, @@ -342,7 +337,12 @@ workflow CNVGermlineCohortWorkflow { scatter (sample_index in range(length(normal_bams))) { call CNVTasks.PostprocessGermlineCNVCalls { input: - bundled_gcnv_outputs = BundleCallerOutputs.bundle_tar, + gcnv_calls_sample_tar = TransposeCallerOutputs.gcnv_calls_sample_tars[sample_index], + gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar, + calling_configs = GermlineCNVCallerCohortMode.calling_config_json, + denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json, + gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json, + sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list, entity_id = CollectCounts.entity_id[sample_index], allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, @@ -597,9 +597,9 @@ task GermlineCNVCallerCohortMode { --caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \ --disable-annealing ~{default="false" disable_annealing} - tar c -C ~{output_dir_}/~{cohort_entity_id}-tracking . | gzip -1 > ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz - tar c -C ~{output_dir_}/~{cohort_entity_id}-calls . | gzip -1 > ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz - tar c -C ~{output_dir_}/~{cohort_entity_id}-model . | gzip -1 > ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz + tar czf ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-tracking . + tar czf ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls . + tar czf ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model . rm -rf contig-ploidy-calls >>> From 5f2949227728dec920e66f7423850db463474c40 Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Wed, 3 Jun 2020 17:35:49 -0400 Subject: [PATCH 04/10] Address comments; bundle auxiliary calls files --- scripts/cnv_wdl/cnv_common_tasks.wdl | 41 ++++++------------- .../cnv_germline_case_scattered_workflow.wdl | 12 +++++- .../germline/cnv_germline_case_workflow.wdl | 25 ++++------- .../germline/cnv_germline_cohort_workflow.wdl | 19 ++++----- 4 files changed, 40 insertions(+), 57 deletions(-) diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl index 5a13ed20c6a..0ad35bbf265 100644 --- a/scripts/cnv_wdl/cnv_common_tasks.wdl +++ b/scripts/cnv_wdl/cnv_common_tasks.wdl @@ -215,14 +215,8 @@ task CollectCounts { Int command_mem_mb = machine_mem_mb - 1000 Boolean enable_indexing_ = select_first([enable_indexing, false]) - Array[String] disabled_read_filters_arr = if(defined(disabled_read_filters)) - then - prefix( - "--disable-read-filter ", - select_first([disabled_read_filters]) - ) - else - [] + + Array[String] disabled_read_filters_arr = if defined(disabled_read_filters) then prefix("--disable-read-filter ", select_first([disabled_read_filters])) else [] # Sample name is derived from the bam filename String base_filename = basename(bam, ".bam") @@ -445,10 +439,7 @@ task PostprocessGermlineCNVCalls { input { File gcnv_calls_sample_tar Array[File] gcnv_model_tars - Array[File] calling_configs - Array[File] denoising_configs - Array[File] gcnvkernel_version - Array[File] sharded_interval_lists + File gcnv_shard_configs_tar String entity_id File contig_ploidy_calls_tar Array[String]? allosomal_contigs @@ -478,22 +469,12 @@ task PostprocessGermlineCNVCalls { set -euo pipefail export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} - sharded_interval_lists_array=(~{sep=" " sharded_interval_lists}) - # untar calls to CALLS_0, CALLS_1, etc directories and build the command line - # also copy over shard config and interval files - calling_configs_array=(~{sep=" " calling_configs}) - denoising_configs_array=(~{sep=" " denoising_configs}) - gcnvkernel_version_array=(~{sep=" " gcnvkernel_version}) - sharded_interval_lists_array=(~{sep=" " sharded_interval_lists}) - calls_args="" tar xzf ~{gcnv_calls_sample_tar} - for index in ${!calling_configs_array[@]}; do - cp ${calling_configs_array[$index]} CALLS_$index/ - cp ${denoising_configs_array[$index]} CALLS_$index/ - cp ${gcnvkernel_version_array[$index]} CALLS_$index/ - cp ${sharded_interval_lists_array[$index]} CALLS_$index/ - calls_args="$calls_args --calls-shard-path CALLS_$index" + tar xzf ~{gcnv_shard_configs_tar} + calls_args="" + for calls_dir in CALLS_*; do + calls_args="$calls_args --calls-shard-path $calls_dir" done # untar models to MODEL_0, MODEL_1, etc directories and build the command line @@ -659,9 +640,12 @@ task TransposeCallerOutputs { NUM_DIGITS=${#NUM_SAMPLES} while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE) - tar c CALLS_*/SAMPLE_$CURRENT_SAMPLE | gzip -1 > case-gcnv-calls-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz + tar c CALLS_*/SAMPLE_$CURRENT_SAMPLE | gzip -1 > gcnv-calls-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz let CURRENT_SAMPLE=CURRENT_SAMPLE+1 done + + rm -r CALLS_*/SAMPLE_* + tar c CALLS_* | gzip -1 > gcnv-shard-configs.tar.gz >>> runtime { @@ -673,7 +657,8 @@ task TransposeCallerOutputs { } output { - Array[File] gcnv_calls_sample_tars = glob("case-gcnv-calls-sample-*.tar.gz") + Array[File] gcnv_calls_sample_tars = glob("gcnv-calls-sample-*.tar.gz") + File gcnv_shard_configs_tar = "gcnv-shard-configs.tar.gz" } } diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl index ec1c8c6636b..e78376d9426 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl @@ -102,6 +102,12 @@ workflow CNVGermlineCaseScatteredWorkflow { Float? gcnv_caller_external_admixing_rate Boolean? gcnv_disable_annealing + ############################################## + #### arguments for TransposeCallerOutputs #### + ############################################## + Int? mem_gb_for_transpose_caller_outputs + Int? disk_space_gb_for_transpose_caller_outputs + ################################################### #### arguments for PostprocessGermlineCNVCalls #### ################################################### @@ -191,6 +197,8 @@ workflow CNVGermlineCaseScatteredWorkflow { gcnv_caller_external_admixing_rate = gcnv_caller_external_admixing_rate, gcnv_disable_annealing = gcnv_disable_annealing, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, + mem_gb_for_transpose_caller_outputs = mem_gb_for_transpose_caller_outputs, + disk_space_gb_for_transpose_caller_outputs = disk_space_gb_for_transpose_caller_outputs, allosomal_contigs = allosomal_contigs, maximum_number_events_per_sample = maximum_number_events_per_sample } @@ -201,8 +209,8 @@ workflow CNVGermlineCaseScatteredWorkflow { Array[File] read_counts_entity_id = flatten(CNVGermlineCaseWorkflow.read_counts_entity_id) Array[File] read_counts = flatten(CNVGermlineCaseWorkflow.read_counts) Array[File] sample_contig_ploidy_calls_tars = flatten(CNVGermlineCaseWorkflow.sample_contig_ploidy_calls_tars) - Array[File] gcnv_calls_tars = flatten(CNVGermlineCaseWorkflow.gcnv_calls_tars) - Array[File] gcnv_tracking_tars = flatten(CNVGermlineCaseWorkflow.gcnv_tracking_tars) + Array[File] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars + Array[File] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars Array[File] genotyped_intervals_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_intervals_vcf) Array[File] genotyped_segments_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_segments_vcf) Array[File] denoised_copy_ratios = flatten(CNVGermlineCaseWorkflow.denoised_copy_ratios) diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl index fc08c141999..f127044f2f2 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl @@ -112,11 +112,11 @@ workflow CNVGermlineCaseWorkflow { Float? gcnv_caller_external_admixing_rate Boolean? gcnv_disable_annealing - ###################################################### - #### arguments for BundleCallerOutputs #### - ###################################################### - Int? mem_gb_for_bundle_caller_outputs - Int? disk_space_gb_for_bundle_caller_outputs + ############################################## + #### arguments for TransposeCallerOutputs #### + ############################################## + Int? mem_gb_for_transpose_caller_outputs + Int? disk_space_gb_for_transpose_caller_outputs ################################################### #### arguments for PostprocessGermlineCNVCalls #### @@ -236,8 +236,8 @@ workflow CNVGermlineCaseWorkflow { input: gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar, docker = gatk_docker, - mem_gb = mem_gb_for_bundle_caller_outputs, - disk_space_gb = disk_space_gb_for_bundle_caller_outputs, + mem_gb = mem_gb_for_transpose_caller_outputs, + disk_space_gb = disk_space_gb_for_transpose_caller_outputs, preemptible_attempts = preemptible_attempts } @@ -247,10 +247,7 @@ workflow CNVGermlineCaseWorkflow { input: gcnv_calls_sample_tar = TransposeCallerOutputs.gcnv_calls_sample_tars[sample_index], gcnv_model_tars = gcnv_model_tars, - calling_configs = GermlineCNVCallerCaseMode.calling_config_json, - denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json, - gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json, - sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list, + gcnv_shard_configs_tar = TransposeCallerOutputs.gcnv_shard_configs_tar, entity_id = CollectCounts.entity_id[sample_index], allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, @@ -468,8 +465,8 @@ task GermlineCNVCallerCaseMode { --caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \ --disable-annealing ~{default="false" disable_annealing} - tar czf case-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-tracking . tar czf case-gcnv-calls-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-calls . + tar czf case-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-tracking . rm -rf contig-ploidy-calls rm -rf gcnv-model @@ -486,9 +483,5 @@ task GermlineCNVCallerCaseMode { output { File gcnv_calls_tar = "case-gcnv-calls-shard-~{scatter_index}.tar.gz" File gcnv_tracking_tar = "case-gcnv-tracking-shard-~{scatter_index}.tar.gz" - File calling_config_json = "~{output_dir_}/case-calls/calling_config.json" - File denoising_config_json = "~{output_dir_}/case-calls/denoising_config.json" - File gcnvkernel_version_json = "~{output_dir_}/case-calls/gcnvkernel_version.json" - File sharded_interval_list = "~{output_dir_}/case-calls/interval_list.tsv" } } diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl index 2ad50a50008..3b68a555fba 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl @@ -149,11 +149,11 @@ workflow CNVGermlineCohortWorkflow { Float? gcnv_caller_external_admixing_rate Boolean? gcnv_disable_annealing - ###################################################### - #### arguments for BundleCallerOutputs #### - ###################################################### - Int? mem_gb_for_bundle_caller_outputs - Int? disk_space_gb_for_bundle_caller_outputs + ############################################## + #### arguments for TransposeCallerOutputs #### + ############################################## + Int? mem_gb_for_transpose_caller_outputs + Int? disk_space_gb_for_transpose_caller_outputs ################################################### #### arguments for PostprocessGermlineCNVCalls #### @@ -329,8 +329,8 @@ workflow CNVGermlineCohortWorkflow { input: gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar, docker = gatk_docker, - mem_gb = mem_gb_for_bundle_caller_outputs, - disk_space_gb = disk_space_gb_for_bundle_caller_outputs, + mem_gb = mem_gb_for_transpose_caller_outputs, + disk_space_gb = disk_space_gb_for_transpose_caller_outputs, preemptible_attempts = preemptible_attempts } @@ -339,10 +339,7 @@ workflow CNVGermlineCohortWorkflow { input: gcnv_calls_sample_tar = TransposeCallerOutputs.gcnv_calls_sample_tars[sample_index], gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar, - calling_configs = GermlineCNVCallerCohortMode.calling_config_json, - denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json, - gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json, - sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list, + gcnv_shard_configs_tar = TransposeCallerOutputs.gcnv_shard_configs_tar, entity_id = CollectCounts.entity_id[sample_index], allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, From d29a0051fa080438a69aeb87dcedee70e2f1532f Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Tue, 9 Jun 2020 09:59:36 -0400 Subject: [PATCH 05/10] Pad sample index in ScatterPloidyCallsBySample --- scripts/cnv_wdl/cnv_common_tasks.wdl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl index 0ad35bbf265..0f6e440ebf9 100644 --- a/scripts/cnv_wdl/cnv_common_tasks.wdl +++ b/scripts/cnv_wdl/cnv_common_tasks.wdl @@ -688,11 +688,13 @@ task ScatterPloidyCallsBySample { # Archive call files by sample, renaming so they will be glob'd in order sample_ids=(~{sep=" " samples}) + num_samples=~{num_samples} + num_digits=${#num_samples} for (( i=0; i<~{num_samples}; i++ )) do sample_id=${sample_ids[$i]} - sample_no=`printf %04d $i` - tar -czf sample_${sample_no}.${sample_id}.contig_ploidy_calls.tar.gz -C calls/SAMPLE_${i} . + padded_sample_index=$(printf "%0${num_digits}d" $i) + tar -czf sample_${padded_sample_index}.${sample_id}.contig_ploidy_calls.tar.gz -C calls/SAMPLE_${i} . done >>> runtime { From e00692bc41b3285b65e2313d0bf18e7a8be800e7 Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Thu, 11 Jun 2020 12:29:59 -0400 Subject: [PATCH 06/10] Fix unflattened output error in CNVGermlineCaseScatteredWorkflow --- .../cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl index e78376d9426..23cd4dcfa0f 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl @@ -209,8 +209,8 @@ workflow CNVGermlineCaseScatteredWorkflow { Array[File] read_counts_entity_id = flatten(CNVGermlineCaseWorkflow.read_counts_entity_id) Array[File] read_counts = flatten(CNVGermlineCaseWorkflow.read_counts) Array[File] sample_contig_ploidy_calls_tars = flatten(CNVGermlineCaseWorkflow.sample_contig_ploidy_calls_tars) - Array[File] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars - Array[File] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars + Array[Array[File]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars + Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars Array[File] genotyped_intervals_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_intervals_vcf) Array[File] genotyped_segments_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_segments_vcf) Array[File] denoised_copy_ratios = flatten(CNVGermlineCaseWorkflow.denoised_copy_ratios) From 70847757def8e187bd3c96cb3d9f86c86de085b4 Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Fri, 10 Jul 2020 14:00:27 -0400 Subject: [PATCH 07/10] Revert back to 2d file array --- scripts/cnv_wdl/cnv_common_tasks.wdl | 110 ++++++------------ .../cnv_germline_case_scattered_workflow.wdl | 2 +- .../germline/cnv_germline_case_workflow.wdl | 41 ++++--- .../germline/cnv_germline_cohort_workflow.wdl | 45 +++---- 4 files changed, 87 insertions(+), 111 deletions(-) diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl index 0f6e440ebf9..d43417a62dd 100644 --- a/scripts/cnv_wdl/cnv_common_tasks.wdl +++ b/scripts/cnv_wdl/cnv_common_tasks.wdl @@ -437,23 +437,26 @@ task ScatterIntervals { task PostprocessGermlineCNVCalls { input { - File gcnv_calls_sample_tar - Array[File] gcnv_model_tars - File gcnv_shard_configs_tar - String entity_id - File contig_ploidy_calls_tar - Array[String]? allosomal_contigs - Int ref_copy_number_autosomal_contigs - Int sample_index - File? gatk4_jar_override - - # Runtime parameters - String gatk_docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? preemptible_attempts + String entity_id + Array[File] gcnv_calls_tars + Array[File] gcnv_model_tars + Array[File] calling_configs + Array[File] denoising_configs + Array[File] gcnvkernel_version + Array[File] sharded_interval_lists + File contig_ploidy_calls_tar + Array[String]? allosomal_contigs + Int ref_copy_number_autosomal_contigs + Int sample_index + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? preemptible_attempts } Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 @@ -466,15 +469,28 @@ task PostprocessGermlineCNVCalls { Array[String] allosomal_contigs_args = if defined(allosomal_contigs) then prefix("--allosomal-contig ", select_first([allosomal_contigs])) else [] command <<< - set -euo pipefail + set -eu export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + sharded_interval_lists_array=(~{sep=" " sharded_interval_lists}) + # untar calls to CALLS_0, CALLS_1, etc directories and build the command line - tar xzf ~{gcnv_calls_sample_tar} - tar xzf ~{gcnv_shard_configs_tar} + # also copy over shard config and interval files + gcnv_calls_tar_array=(~{sep=" " gcnv_calls_tars}) + calling_configs_array=(~{sep=" " calling_configs}) + denoising_configs_array=(~{sep=" " denoising_configs}) + gcnvkernel_version_array=(~{sep=" " gcnvkernel_version}) + sharded_interval_lists_array=(~{sep=" " sharded_interval_lists}) calls_args="" - for calls_dir in CALLS_*; do - calls_args="$calls_args --calls-shard-path $calls_dir" + for index in ${!gcnv_calls_tar_array[@]}; do + gcnv_calls_tar=${gcnv_calls_tar_array[$index]} + mkdir -p CALLS_$index/SAMPLE_~{sample_index} + tar xzf $gcnv_calls_tar -C CALLS_$index/SAMPLE_~{sample_index} + cp ${calling_configs_array[$index]} CALLS_$index/ + cp ${denoising_configs_array[$index]} CALLS_$index/ + cp ${gcnvkernel_version_array[$index]} CALLS_$index/ + cp ${sharded_interval_lists_array[$index]} CALLS_$index/ + calls_args="$calls_args --calls-shard-path CALLS_$index" done # untar models to MODEL_0, MODEL_1, etc directories and build the command line @@ -512,7 +528,6 @@ task PostprocessGermlineCNVCalls { disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD" cpu: select_first([cpu, 1]) preemptible: select_first([preemptible_attempts, 5]) - maxRetries: 1 } output { @@ -613,55 +628,6 @@ task CollectModelQualityMetrics { } } -task TransposeCallerOutputs { - input { - Array[File] gcnv_calls_tars - - # Runtime parameters - String docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? cpu - Int? preemptible_attempts - } - - command <<< - set -euo pipefail - - gcnv_calls_tar_array=(~{sep=" " gcnv_calls_tars}) - for index in ${!gcnv_calls_tar_array[@]}; do - mkdir CALLS_$index - tar xzf ${gcnv_calls_tar_array[$index]} -C CALLS_$index - done - - CURRENT_SAMPLE=0 - NUM_SAMPLES=$(ls -d CALLS_0/SAMPLE_* | wc -l) - NUM_DIGITS=${#NUM_SAMPLES} - while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do - CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE) - tar c CALLS_*/SAMPLE_$CURRENT_SAMPLE | gzip -1 > gcnv-calls-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz - let CURRENT_SAMPLE=CURRENT_SAMPLE+1 - done - - rm -r CALLS_*/SAMPLE_* - tar c CALLS_* | gzip -1 > gcnv-shard-configs.tar.gz - >>> - - runtime { - docker: docker - memory: select_first([mem_gb, 2]) + " GiB" - disks: "local-disk " + select_first([disk_space_gb, 150]) + if use_ssd then " SSD" else " HDD" - cpu: select_first([cpu, 1]) - preemptible: select_first([preemptible_attempts, 5]) - } - - output { - Array[File] gcnv_calls_sample_tars = glob("gcnv-calls-sample-*.tar.gz") - File gcnv_shard_configs_tar = "gcnv-shard-configs.tar.gz" - } -} - task ScatterPloidyCallsBySample { input { File contig_ploidy_calls_tar diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl index 23cd4dcfa0f..4fd09289465 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl @@ -209,7 +209,7 @@ workflow CNVGermlineCaseScatteredWorkflow { Array[File] read_counts_entity_id = flatten(CNVGermlineCaseWorkflow.read_counts_entity_id) Array[File] read_counts = flatten(CNVGermlineCaseWorkflow.read_counts) Array[File] sample_contig_ploidy_calls_tars = flatten(CNVGermlineCaseWorkflow.sample_contig_ploidy_calls_tars) - Array[Array[File]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars + Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars Array[File] genotyped_intervals_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_intervals_vcf) Array[File] genotyped_segments_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_segments_vcf) diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl index f127044f2f2..0131115f9c6 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl @@ -232,32 +232,25 @@ workflow CNVGermlineCaseWorkflow { } } - call CNVTasks.TransposeCallerOutputs { - input: - gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar, - docker = gatk_docker, - mem_gb = mem_gb_for_transpose_caller_outputs, - disk_space_gb = disk_space_gb_for_transpose_caller_outputs, - preemptible_attempts = preemptible_attempts - } + Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCaseMode.gcnv_call_tars) scatter (sample_index in range(length(normal_bams))) { - call CNVTasks.PostprocessGermlineCNVCalls { input: - gcnv_calls_sample_tar = TransposeCallerOutputs.gcnv_calls_sample_tars[sample_index], - gcnv_model_tars = gcnv_model_tars, - gcnv_shard_configs_tar = TransposeCallerOutputs.gcnv_shard_configs_tar, entity_id = CollectCounts.entity_id[sample_index], + gcnv_calls_tars = call_tars_sample_by_shard[sample_index], + gcnv_model_tars = gcnv_model_tars, + calling_configs = GermlineCNVCallerCaseMode.calling_config_json, + denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json, + gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json, + sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list, allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar, sample_index = sample_index, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - mem_gb = mem_gb_for_postprocess_germline_cnv_calls, - disk_space_gb = disk_space_gb_for_postprocess_germline_cnv_calls + preemptible_attempts = preemptible_attempts } call CNVTasks.CollectSampleQualityMetrics { @@ -283,7 +276,7 @@ workflow CNVGermlineCaseWorkflow { Array[File] read_counts_entity_id = CollectCounts.entity_id Array[File] read_counts = CollectCounts.counts Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar - Array[File] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar + Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf @@ -465,9 +458,17 @@ task GermlineCNVCallerCaseMode { --caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \ --disable-annealing ~{default="false" disable_annealing} - tar czf case-gcnv-calls-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-calls . tar czf case-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-tracking . + CURRENT_SAMPLE=0 + NUM_SAMPLES=~{num_samples} + NUM_DIGITS=${#NUM_SAMPLES} + while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do + CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE) + tar czf case-gcnv-calls-shard-~{scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ~{output_dir_}/case-calls/SAMPLE_$CURRENT_SAMPLE . + let CURRENT_SAMPLE=CURRENT_SAMPLE+1 + done + rm -rf contig-ploidy-calls rm -rf gcnv-model >>> @@ -481,7 +482,11 @@ task GermlineCNVCallerCaseMode { } output { - File gcnv_calls_tar = "case-gcnv-calls-shard-~{scatter_index}.tar.gz" + Array[File] gcnv_call_tars = glob("case-gcnv-calls-shard-~{scatter_index}-sample-*.tar.gz") File gcnv_tracking_tar = "case-gcnv-tracking-shard-~{scatter_index}.tar.gz" + File calling_config_json = "~{output_dir_}/case-calls/calling_config.json" + File denoising_config_json = "~{output_dir_}/case-calls/denoising_config.json" + File gcnvkernel_version_json = "~{output_dir_}/case-calls/gcnvkernel_version.json" + File sharded_interval_list = "~{output_dir_}/case-calls/interval_list.tsv" } } diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl index 3b68a555fba..aa91bed87d0 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl @@ -325,31 +325,26 @@ workflow CNVGermlineCohortWorkflow { } } - call CNVTasks.TransposeCallerOutputs { - input: - gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar, - docker = gatk_docker, - mem_gb = mem_gb_for_transpose_caller_outputs, - disk_space_gb = disk_space_gb_for_transpose_caller_outputs, - preemptible_attempts = preemptible_attempts - } - scatter (sample_index in range(length(normal_bams))) { + Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars) + + scatter (sample_index in range(length(CollectCounts.entity_id))) { call CNVTasks.PostprocessGermlineCNVCalls { input: - gcnv_calls_sample_tar = TransposeCallerOutputs.gcnv_calls_sample_tars[sample_index], - gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar, - gcnv_shard_configs_tar = TransposeCallerOutputs.gcnv_shard_configs_tar, entity_id = CollectCounts.entity_id[sample_index], + gcnv_calls_tars = call_tars_sample_by_shard[sample_index], + gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar, + calling_configs = GermlineCNVCallerCohortMode.calling_config_json, + denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json, + gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json, + sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list, + contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar, allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, - contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar, sample_index = sample_index, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - mem_gb = mem_gb_for_postprocess_germline_cnv_calls, - disk_space_gb = disk_space_gb_for_postprocess_germline_cnv_calls + preemptible_attempts = preemptible_attempts } call CNVTasks.CollectSampleQualityMetrics { @@ -386,7 +381,7 @@ workflow CNVGermlineCohortWorkflow { File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar - Array[File] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar + Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf @@ -537,6 +532,8 @@ task GermlineCNVCallerCohortMode { String output_dir_ = select_first([output_dir, "out"]) Int num_samples = length(read_count_files) + String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819 + command <<< set -eu export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} @@ -594,9 +591,17 @@ task GermlineCNVCallerCohortMode { --caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \ --disable-annealing ~{default="false" disable_annealing} + tar czf ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model . tar czf ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-tracking . - tar czf ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls . - tar czf ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model . + + CURRENT_SAMPLE=0 + NUM_SAMPLES=~{num_samples} + NUM_DIGITS=${#NUM_SAMPLES} + while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do + CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE) + tar czf ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE . + let CURRENT_SAMPLE=CURRENT_SAMPLE+1 + done rm -rf contig-ploidy-calls >>> @@ -611,7 +616,7 @@ task GermlineCNVCallerCohortMode { output { File gcnv_model_tar = "~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz" - File gcnv_calls_tar = "~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz" + Array[File] gcnv_call_tars = glob("~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-*.tar.gz") File gcnv_tracking_tar = "~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz" File calling_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/calling_config.json" File denoising_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/denoising_config.json" From 57680ecd55d48bfe4389765952f70e89cdf8e6eb Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Wed, 29 Jul 2020 11:23:59 -0400 Subject: [PATCH 08/10] Delete transpose task inputs --- .../germline/cnv_germline_case_scattered_workflow.wdl | 6 ------ scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl | 6 ------ scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl | 7 ------- 3 files changed, 19 deletions(-) diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl index 4fd09289465..427bd52c8d4 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl @@ -102,12 +102,6 @@ workflow CNVGermlineCaseScatteredWorkflow { Float? gcnv_caller_external_admixing_rate Boolean? gcnv_disable_annealing - ############################################## - #### arguments for TransposeCallerOutputs #### - ############################################## - Int? mem_gb_for_transpose_caller_outputs - Int? disk_space_gb_for_transpose_caller_outputs - ################################################### #### arguments for PostprocessGermlineCNVCalls #### ################################################### diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl index 0131115f9c6..67f1c9b5932 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl @@ -112,12 +112,6 @@ workflow CNVGermlineCaseWorkflow { Float? gcnv_caller_external_admixing_rate Boolean? gcnv_disable_annealing - ############################################## - #### arguments for TransposeCallerOutputs #### - ############################################## - Int? mem_gb_for_transpose_caller_outputs - Int? disk_space_gb_for_transpose_caller_outputs - ################################################### #### arguments for PostprocessGermlineCNVCalls #### ################################################### diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl index aa91bed87d0..5f92933b834 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl @@ -149,12 +149,6 @@ workflow CNVGermlineCohortWorkflow { Float? gcnv_caller_external_admixing_rate Boolean? gcnv_disable_annealing - ############################################## - #### arguments for TransposeCallerOutputs #### - ############################################## - Int? mem_gb_for_transpose_caller_outputs - Int? disk_space_gb_for_transpose_caller_outputs - ################################################### #### arguments for PostprocessGermlineCNVCalls #### ################################################### @@ -325,7 +319,6 @@ workflow CNVGermlineCohortWorkflow { } } - Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars) scatter (sample_index in range(length(CollectCounts.entity_id))) { From d24b40bb2ea03ab0ba0a07e2615948c13e66991b Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Mon, 10 Aug 2020 10:28:34 -0400 Subject: [PATCH 09/10] Fix minor error --- .../cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl index 427bd52c8d4..3991fea48f5 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl @@ -191,8 +191,6 @@ workflow CNVGermlineCaseScatteredWorkflow { gcnv_caller_external_admixing_rate = gcnv_caller_external_admixing_rate, gcnv_disable_annealing = gcnv_disable_annealing, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, - mem_gb_for_transpose_caller_outputs = mem_gb_for_transpose_caller_outputs, - disk_space_gb_for_transpose_caller_outputs = disk_space_gb_for_transpose_caller_outputs, allosomal_contigs = allosomal_contigs, maximum_number_events_per_sample = maximum_number_events_per_sample } From aa66473cda7763e62dcea1c1ff1f6d9a04e022f9 Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Tue, 11 Aug 2020 12:23:18 -0400 Subject: [PATCH 10/10] Add space --- scripts/cnv_wdl/cnv_common_tasks.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl index d43417a62dd..def1e031479 100644 --- a/scripts/cnv_wdl/cnv_common_tasks.wdl +++ b/scripts/cnv_wdl/cnv_common_tasks.wdl @@ -663,6 +663,7 @@ task ScatterPloidyCallsBySample { tar -czf sample_${padded_sample_index}.${sample_id}.contig_ploidy_calls.tar.gz -C calls/SAMPLE_${i} . done >>> + runtime { docker: docker memory: select_first([mem_gb, 2]) + " GiB"